diff --git a/.bazelrc b/.bazelrc index e21a1a32917..f11c376df65 100644 --- a/.bazelrc +++ b/.bazelrc @@ -39,6 +39,7 @@ # # Feature and Third party library support options: # xla: Build TF with XLA +# tpu: Build TF with TPU support # using_cuda: CUDA is available to build system. # cuda: Build with full cuda support. # rocm: Build with AMD GPU support (rocm). @@ -57,13 +58,12 @@ # # # Remote build execution options (only configured to work with TF team projects for now.) -# rbe: General RBE options shared by all flavors. -# rbe_linux: General RBE options used on all linux builds. -# rbe_win: General RBE options used on all windows builds. +# rbe: General RBE options shared by all flavors. +# rbe_linux: General RBE options used on all linux builds. +# rbe_win: General RBE options used on all windows builds. # -# rbe_cpu_linux: RBE options to build with only CPU support. -# rbe_linux_cuda_nvcc: RBE options to build with GPU support using nvcc. -# rbe_gpu_linux: An alias for rbe_linux_cuda_nvcc +# rbe_cpu_linux: RBE options to build with only CPU support. +# rbe_linux_cuda_nvcc_py*: RBE options to build with GPU support using nvcc. # # rbe_linux_py2: Linux Python 2 RBE config. # rbe_linux_py3: Linux Python 3 RBE config @@ -180,6 +180,9 @@ build:dbg --cxxopt -DTF_LITE_DISABLE_X86_NEON # AWS SDK must be compiled in release mode. see: https://github.com/tensorflow/tensorflow/issues/37498 build:dbg --copt -DDEBUG_BUILD +# Config to build TPU backend +build:tpu --define=with_tpu_support=true + build:tensorrt --action_env TF_NEED_TENSORRT=1 build:rocm --crosstool_top=@local_config_rocm//crosstool:toolchain @@ -396,33 +399,48 @@ build:rbe_linux_cuda_base --repo_env=REMOTE_GPU_TESTING=1 build:rbe_linux_cuda_base --repo_env=TF_NEED_CUDA=1 test:rbe_linux_cuda_base --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" -build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda_base -build:rbe_linux_cuda_nvcc --crosstool_top="@ubuntu18.04-py3-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda//crosstool:toolchain" -build:rbe_linux_cuda_nvcc --extra_toolchains="@ubuntu18.04-py3-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda//crosstool:toolchain-linux-x86_64" -build:rbe_linux_cuda_nvcc --extra_execution_platforms="@ubuntu18.04-py3-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_platform//:platform" -build:rbe_linux_cuda_nvcc --host_platform="@ubuntu18.04-py3-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_platform//:platform" -build:rbe_linux_cuda_nvcc --platforms="@ubuntu18.04-py3-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_platform//:platform" -build:rbe_linux_cuda_nvcc --repo_env=TF_CUDA_CONFIG_REPO="@ubuntu18.04-py3-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda" -build:rbe_linux_cuda_nvcc --repo_env=TF_TENSORRT_CONFIG_REPO="@ubuntu18.04-py3-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_tensorrt" -build:rbe_linux_cuda_nvcc --repo_env=TF_NCCL_CONFIG_REPO="@ubuntu18.04-py3-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_nccl" -build:rbe_linux_cuda_nvcc --define=using_cuda_nvcc=true -test:rbe_linux_cuda_nvcc --config=rbe_linux_cuda_base +build:rbe_linux_cuda10.1_nvcc_base --config=rbe_linux_cuda_base +build:rbe_linux_cuda10.1_nvcc_base --define=using_cuda_nvcc=true +build:rbe_linux_cuda10.1_nvcc_base --crosstool_top="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda//crosstool:toolchain" +build:rbe_linux_cuda10.1_nvcc_base 
--extra_toolchains="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda//crosstool:toolchain-linux-x86_64" +build:rbe_linux_cuda10.1_nvcc_base --extra_execution_platforms="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_platform//:platform" +build:rbe_linux_cuda10.1_nvcc_base --host_platform="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_platform//:platform" +build:rbe_linux_cuda10.1_nvcc_base --platforms="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_platform//:platform" +build:rbe_linux_cuda10.1_nvcc_base --repo_env=TF_CUDA_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda" +build:rbe_linux_cuda10.1_nvcc_base --repo_env=TF_TENSORRT_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_tensorrt" +build:rbe_linux_cuda10.1_nvcc_base --repo_env=TF_NCCL_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_nccl" +build:rbe_linux_cuda10.1_nvcc_py2.7 --config=rbe_linux_cuda10.1_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python2.7" +build:rbe_linux_cuda10.1_nvcc_py3.5 --config=rbe_linux_cuda10.1_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python3.5" +build:rbe_linux_cuda10.1_nvcc_py3.6 --config=rbe_linux_cuda10.1_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python3.6" +build:rbe_linux_cuda10.1_nvcc_py3.7 --config=rbe_linux_cuda10.1_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python3.7" +build:rbe_linux_cuda10.1_nvcc_py3.8 --config=rbe_linux_cuda10.1_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python3.8" -build:rbe_linux_cuda_nvcc_base --config=rbe_linux_cuda_base -build:rbe_linux_cuda_nvcc_base --crosstool_top="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda//crosstool:toolchain" -build:rbe_linux_cuda_nvcc_base --extra_toolchains="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda//crosstool:toolchain-linux-x86_64" -build:rbe_linux_cuda_nvcc_base --extra_execution_platforms="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_platform//:platform" -build:rbe_linux_cuda_nvcc_base --host_platform="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_platform//:platform" -build:rbe_linux_cuda_nvcc_base --platforms="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_platform//:platform" -build:rbe_linux_cuda_nvcc_base --repo_env=TF_CUDA_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda" -build:rbe_linux_cuda_nvcc_base --repo_env=TF_TENSORRT_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_tensorrt" -build:rbe_linux_cuda_nvcc_base --repo_env=TF_NCCL_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_nccl" -build:rbe_linux_cuda_nvcc_base --define=using_cuda_nvcc=true -build:rbe_linux_cuda_nvcc_py27 --config=rbe_linux_cuda_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python2.7" -build:rbe_linux_cuda_nvcc_py35 --config=rbe_linux_cuda_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python3.5" 
-build:rbe_linux_cuda_nvcc_py36 --config=rbe_linux_cuda_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python3.6" -build:rbe_linux_cuda_nvcc_py37 --config=rbe_linux_cuda_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python3.7" -build:rbe_linux_cuda_nvcc_py38 --config=rbe_linux_cuda_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python3.8" +build:rbe_linux_cuda11.0_nvcc_base --config=rbe_linux_cuda_base +build:rbe_linux_cuda11.0_nvcc_base --define=using_cuda_nvcc=true +build:rbe_linux_cuda11.0_nvcc_base --crosstool_top="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_cuda//crosstool:toolchain" +build:rbe_linux_cuda11.0_nvcc_base --extra_toolchains="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_cuda//crosstool:toolchain-linux-x86_64" +build:rbe_linux_cuda11.0_nvcc_base --extra_execution_platforms="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_platform//:platform" +build:rbe_linux_cuda11.0_nvcc_base --host_platform="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_platform//:platform" +build:rbe_linux_cuda11.0_nvcc_base --platforms="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_platform//:platform" +build:rbe_linux_cuda11.0_nvcc_base --repo_env=TF_CUDA_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_cuda" +build:rbe_linux_cuda11.0_nvcc_base --repo_env=TF_TENSORRT_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_tensorrt" +build:rbe_linux_cuda11.0_nvcc_base --repo_env=TF_NCCL_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_nccl" +build:rbe_linux_cuda11.0_nvcc_py2.7 --config=rbe_linux_cuda11.0_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_python2.7" +build:rbe_linux_cuda11.0_nvcc_py3.5 --config=rbe_linux_cuda11.0_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_python3.5" +build:rbe_linux_cuda11.0_nvcc_py3.6 --config=rbe_linux_cuda11.0_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_python3.6" +build:rbe_linux_cuda11.0_nvcc_py3.7 --config=rbe_linux_cuda11.0_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_python3.7" +build:rbe_linux_cuda11.0_nvcc_py3.8 --config=rbe_linux_cuda11.0_nvcc_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_python3.8" + +# Map default to CUDA 10.1. +build:rbe_linux_cuda_nvcc_py27 --config=rbe_linux_cuda10.1_nvcc_py2.7 +build:rbe_linux_cuda_nvcc_py35 --config=rbe_linux_cuda10.1_nvcc_py3.5 +build:rbe_linux_cuda_nvcc_py36 --config=rbe_linux_cuda10.1_nvcc_py3.6 +build:rbe_linux_cuda_nvcc_py37 --config=rbe_linux_cuda10.1_nvcc_py3.7 +build:rbe_linux_cuda_nvcc_py38 --config=rbe_linux_cuda10.1_nvcc_py3.8 + +# Deprecated configs that people might still use. 
+build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda_nvcc_py36 +build:rbe_gpu_linux --config=rbe_linux_cuda_nvcc build:rbe_linux_cuda_clang_base --config=rbe_linux_cuda_base build:rbe_linux_cuda_clang_base --crosstool_top="@ubuntu16.04-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda//crosstool:toolchain" @@ -440,8 +458,6 @@ build:rbe_linux_cuda_clang_py36 --config=rbe_linux_cuda_clang_base --repo_env=TF build:rbe_linux_cuda_clang_py37 --config=rbe_linux_cuda_clang_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu16.04-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python3.7" build:rbe_linux_cuda_clang_py38 --config=rbe_linux_cuda_clang_base --repo_env=TF_PYTHON_CONFIG_REPO="@ubuntu16.04-clang_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_python3.8" -common:rbe_gpu_linux --config=rbe_linux_cuda_nvcc - build:rbe_linux_py2 --config=rbe_linux build:rbe_linux_py2 --repo_env=PYTHON_BIN_PATH="/usr/bin/python2" build:rbe_linux_py2 --python_path="/usr/bin/python2" diff --git a/README.md b/README.md index 54c9470b04b..73a345706a4 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,7 @@ for general questions and discussion, and please direct specific questions to The TensorFlow project strives to abide by generally accepted best practices in open-source software development: +[![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/tensorflow.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:tensorflow) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/1486/badge)](https://bestpractices.coreinfrastructure.org/projects/1486) [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-v1.4%20adopted-ff69b4.svg)](CODE_OF_CONDUCT.md) diff --git a/RELEASE.md b/RELEASE.md index f93626cc876..68d9399676a 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,57 @@ +# Release 2.4.0 + + + +## Breaking Changes + +* +* + +## Known Caveats + +* + +## Major Features and Improvements + +* +* + +## Bug Fixes and Other Changes + +* +* +* +* TF Core: + * +* `tf.data`: + * +* `tf.distribute`: + * +* `tf.keras`: + * +* `tf.function`/AutoGraph: + * +* `tf.lite`: + * +* `tf.random`: + * +* Math and Linear Algebra: + * +* TPU Enhancements: + * +* XLA Support: + * +* Tracing and Debugging: + * +* Other: + * + +## Thanks to our Contributors + +This release contains contributions from many people at Google, as well as: + +, , , , , + # Release 2.3.0 ## Breaking Changes diff --git a/tensorflow/BUILD b/tensorflow/BUILD index bd0619b0c05..d00608ccc98 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -467,6 +467,13 @@ config_setting( visibility = ["//visibility:public"], ) +# This flag enables experimental TPU support +config_setting( + name = "with_tpu_support", + values = {"define": "with_tpu_support=true"}, + visibility = ["//visibility:public"], +) + # Specifies via a config setting if this is a mobile build or not, makes # it easier to combine settings later. 
selects.config_setting_group( diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index e9e6d470c68..831c6a0ad40 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -624,7 +624,7 @@ void TFE_InferShapes(TFE_Op* tfe_op, TF_ShapeAndTypeList* input_shapes, const int num_inputs = input_shapes->num_items; NodeDef node_def; - tensorflow::AbstractOperationInterface* op = tensorflow::unwrap(tfe_op); + tensorflow::ImmediateExecutionOperation* op = tensorflow::unwrap(tfe_op); node_def.set_name(op->Name()); node_def.set_op(op->Name()); for (int i = 0; i < num_inputs; ++i) { diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 9d3c79e0ae7..5f7ab4a1f59 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -38,9 +38,10 @@ tf_cuda_library( "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ - ":context_interface", - ":operation_interface", - ":tensor_handle_interface", + ":immediate_execution_context", + ":immediate_execution_operation", + ":immediate_execution_tensor_handle", + ":abstract_tensor_handle", ":tfe_context_internal", ":tfe_cancellation_manager_internal", ":tfe_executor_internal", @@ -101,13 +102,17 @@ tf_cuda_library( filegroup( name = "pywrap_required_hdrs", srcs = [ + "abstract_context.h", + "abstract_function.h", + "abstract_operation.h", + "abstract_tensor_handle.h", "c_api_experimental.h", "c_api_internal.h", "c_api_unified_experimental.h", - "context_interface.h", "dlpack.h", - "operation_interface.h", - "tensor_handle_interface.h", + "immediate_execution_context.h", + "immediate_execution_operation.h", + "immediate_execution_tensor_handle.h", "tfe_cancellation_manager_internal.h", "tfe_executor_internal.h", "tfe_monitoring_internal.h", @@ -163,12 +168,22 @@ cc_library( ) cc_library( - name = "tensor_handle_interface", - hdrs = ["tensor_handle_interface.h"], + name = "abstract_tensor_handle", + hdrs = ["abstract_tensor_handle.h"], + visibility = [ + "//tensorflow:internal", + ], + deps = [], +) + +cc_library( + name = "immediate_execution_tensor_handle", + hdrs = ["immediate_execution_tensor_handle.h"], visibility = [ "//tensorflow:internal", ], deps = [ + ":abstract_tensor_handle", "//tensorflow/c:tensor_interface", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -177,13 +192,13 @@ cc_library( ) cc_library( - name = "operation_interface", - hdrs = ["operation_interface.h"], + name = "abstract_operation", + hdrs = ["abstract_operation.h"], visibility = [ "//tensorflow:internal", ], deps = [ - ":tensor_handle_interface", + ":abstract_tensor_handle", "//tensorflow/c:tensor_interface", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -193,14 +208,58 @@ cc_library( ) cc_library( - name = "context_interface", - hdrs = ["context_interface.h"], + name = "immediate_execution_operation", + hdrs = ["immediate_execution_operation.h"], visibility = [ "//tensorflow:internal", ], deps = [ - ":operation_interface", - ":tensor_handle_interface", + ":abstract_operation", + ":abstract_tensor_handle", + ":immediate_execution_tensor_handle", + "//tensorflow/c:tensor_interface", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "abstract_context", + hdrs = ["abstract_context.h"], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":abstract_function", + ":abstract_operation", + ], +) + +cc_library( + name = 
"abstract_function", + hdrs = ["abstract_function.h"], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + "//tensorflow/c:c_api", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:status", + ], +) + +cc_library( + name = "immediate_execution_context", + hdrs = ["immediate_execution_context.h"], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":abstract_context", + ":immediate_execution_operation", + ":immediate_execution_tensor_handle", "//tensorflow/c:tensor_interface", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -217,7 +276,7 @@ cc_library( "//tensorflow:internal", ], deps = [ - ":context_interface", + ":immediate_execution_context", "//tensorflow/c:conversion_macros", ], ) @@ -277,7 +336,7 @@ cc_library( "//tensorflow:internal", ], deps = [ - ":operation_interface", + ":immediate_execution_operation", "//tensorflow/c:conversion_macros", ], ) @@ -300,7 +359,7 @@ cc_library( "//tensorflow:internal", ], deps = [ - ":tensor_handle_interface", + ":immediate_execution_tensor_handle", "//tensorflow/c:conversion_macros", ], ) @@ -480,6 +539,9 @@ tf_cuda_library( ":tfe_context_internal", ":tfe_op_internal", ":tfe_tensorhandle_internal", + ":abstract_operation", + ":abstract_context", + ":abstract_tensor_handle", "//tensorflow/c:c_api", "//tensorflow/c:c_api_internal", "//tensorflow/core:core_cpu", diff --git a/tensorflow/c/eager/abstract_context.h b/tensorflow/c/eager/abstract_context.h new file mode 100644 index 00000000000..36d983e1408 --- /dev/null +++ b/tensorflow/c/eager/abstract_context.h @@ -0,0 +1,83 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_ABSTRACT_CONTEXT_H_ +#define TENSORFLOW_C_EAGER_ABSTRACT_CONTEXT_H_ + +#include +#include + +#include "tensorflow/c/eager/abstract_function.h" +#include "tensorflow/c/eager/abstract_operation.h" + +namespace tensorflow { + +// Abstract interface to a context. +// +// This serves as a factory for creating `AbstractOperation`s and for +// registering traced functions. +// Operations creation within a context can only be executed in that context +// (for now at least). +// Implementations of the context may contain some state e.g. an execution +// environment, a traced representation etc. +class AbstractContext { + protected: + enum AbstractContextKind { kTracing, kImmediateExecution }; + explicit AbstractContext(AbstractContextKind kind) : kind_(kind) {} + virtual ~AbstractContext() {} + + public: + AbstractContextKind getKind() const { return kind_; } + + // Release any underlying resources, including the interface object. + // + // WARNING: The destructor of this class is marked as protected to disallow + // clients from directly destroying this object since it may manage it's own + // lifetime through ref counting. Thus clients MUST call Release() in order to + // destroy an instance of this class. 
+ virtual void Release() = 0; + + // Creates an operation builder and ties it to this context. + // The returned object can be used for setting operation's attributes, + // adding inputs and finally executing (immediately or lazily as in tracing) + // it in this context. + virtual AbstractOperation* CreateOperation() = 0; + + // Registers a function with this context, after this the function is + // available to be called/referenced by its name in this context. + virtual Status RegisterFunction(AbstractFunction*) = 0; + // Remove a function. 'func' argument is the name of a previously added + // FunctionDef. The name is in fdef.signature.name. + virtual Status RemoveFunction(const string& func) = 0; + + private: + const AbstractContextKind kind_; +}; + +namespace internal { +struct AbstractContextDeleter { + void operator()(AbstractContext* p) const { + if (p != nullptr) { + p->Release(); + } + } +}; +} // namespace internal + +using AbstractContextPtr = + std::unique_ptr; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_ABSTRACT_CONTEXT_H_ diff --git a/tensorflow/c/eager/abstract_function.h b/tensorflow/c/eager/abstract_function.h new file mode 100644 index 00000000000..e322b31f2b4 --- /dev/null +++ b/tensorflow/c/eager/abstract_function.h @@ -0,0 +1,46 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_ABSTRACT_FUNCTION_H_ +#define TENSORFLOW_C_EAGER_ABSTRACT_FUNCTION_H_ + +#include "tensorflow/c/c_api.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// A traced function: this hides the complexity of converting the serialized +// representation between various supported formats e.g. FunctionDef and Mlir +// function. +class AbstractFunction { + protected: + enum AbstractFunctionKind { kGraphFunc, kMlirFunc }; + explicit AbstractFunction(AbstractFunctionKind kind) : kind_(kind) {} + + public: + // Returns which subclass is this instance of. + AbstractFunctionKind getKind() const { return kind_; } + virtual ~AbstractFunction() = default; + + // Returns the AbstractFunction as a FunctionDef. + virtual Status GetFunctionDef(FunctionDef**) = 0; + + private: + const AbstractFunctionKind kind_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_ABSTRACT_FUNCTION_H_ diff --git a/tensorflow/c/eager/operation_interface.h b/tensorflow/c/eager/abstract_operation.h similarity index 77% rename from tensorflow/c/eager/operation_interface.h rename to tensorflow/c/eager/abstract_operation.h index 844ba6c14bd..817d7656ec8 100644 --- a/tensorflow/c/eager/operation_interface.h +++ b/tensorflow/c/eager/abstract_operation.h @@ -12,24 +12,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_C_EAGER_OPERATION_INTERFACE_H_ -#define TENSORFLOW_C_EAGER_OPERATION_INTERFACE_H_ +#ifndef TENSORFLOW_C_EAGER_ABSTRACT_OPERATION_H_ +#define TENSORFLOW_C_EAGER_ABSTRACT_OPERATION_H_ + +#include #include "absl/types/span.h" -#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" #include "tensorflow/c/tensor_interface.h" -#include "tensorflow/core/framework/device_attributes.pb.h" -#include "tensorflow/core/framework/op_def.pb.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/platform/status.h" -struct TFE_Op; - namespace tensorflow { // Abstract interface to an operation. -class AbstractOperationInterface { +// This interface allows building and executing an operation in either +// tracing or immediate execution mode. +class AbstractOperation { + protected: + enum AbstractOperationKind { kTracing, kImmediateExecution }; + explicit AbstractOperation(AbstractOperationKind kind) : kind_(kind) {} + virtual ~AbstractOperation() {} + public: + AbstractOperationKind getKind() const { return kind_; } + // Release any underlying resources, including the interface object. // // WARNING: The destructor of this class is marked as protected to disallow @@ -38,7 +45,6 @@ class AbstractOperationInterface { // clients MUST call Release() in order to destroy an instance of this class. virtual void Release() = 0; - virtual void Clear() = 0; virtual Status Reset(const char* op, const char* raw_device_name) = 0; virtual const string& Name() const = 0; @@ -66,12 +72,10 @@ class AbstractOperationInterface { // existing and given constraints will be performed. virtual Status SetDeviceName(const char* name) = 0; - virtual Status AddInput(AbstractTensorHandleInterface* input) = 0; - virtual Status AddInputList( - absl::Span inputs) = 0; - virtual Status Execute(absl::Span retvals, + virtual Status AddInput(AbstractTensorHandle* input) = 0; + virtual Status AddInputList(absl::Span inputs) = 0; + virtual Status Execute(absl::Span retvals, int* num_retvals) = 0; - virtual const tensorflow::OpDef* OpDef() const = 0; virtual Status SetAttrString(const char* attr_name, const char* data, size_t length) = 0; @@ -82,7 +86,7 @@ class AbstractOperationInterface { virtual Status SetAttrShape(const char* attr_name, const int64_t* dims, const int num_dims) = 0; virtual Status SetAttrFunction(const char* attr_name, - const AbstractOperationInterface* value) = 0; + const AbstractOperation* value) = 0; virtual Status SetAttrFunctionName(const char* attr_name, const char* value, size_t length) = 0; virtual Status SetAttrTensor(const char* attr_name, @@ -102,19 +106,25 @@ class AbstractOperationInterface { virtual Status SetAttrShapeList(const char* attr_name, const int64_t** dims, const int* num_dims, int num_values) = 0; virtual Status SetAttrFunctionList( - const char* attr_name, - absl::Span values) = 0; + const char* attr_name, absl::Span values) = 0; - virtual Status InputLength(const char* input_name, int* length) = 0; - virtual Status OutputLength(const char* output_name, int* length) = 0; - - // Experimental - virtual Status SetUseXla(bool enable) = 0; - - protected: - virtual ~AbstractOperationInterface() {} + private: + const AbstractOperationKind kind_; }; +namespace internal { +struct AbstractOperationDeleter { + void operator()(AbstractOperation* p) const { + if (p != nullptr) { + p->Release(); + } + } +}; +} // namespace internal + 
+using AbstractOpPtr = + std::unique_ptr; + } // namespace tensorflow -#endif // TENSORFLOW_C_EAGER_OPERATION_INTERFACE_H_ +#endif // TENSORFLOW_C_EAGER_ABSTRACT_OPERATION_H_ diff --git a/tensorflow/c/eager/abstract_tensor_handle.h b/tensorflow/c/eager/abstract_tensor_handle.h new file mode 100644 index 00000000000..64b941d0729 --- /dev/null +++ b/tensorflow/c/eager/abstract_tensor_handle.h @@ -0,0 +1,61 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_ABSTRACT_TENSOR_HANDLE_H_ +#define TENSORFLOW_C_EAGER_ABSTRACT_TENSOR_HANDLE_H_ + +#include + +namespace tensorflow { + +// Abstract interface to a Tensor handle in either tracing or immediate +// execution mode. +class AbstractTensorHandle { + protected: + enum AbstractTensorHandleKind { kTracing, kImmediateExecution }; + explicit AbstractTensorHandle(AbstractTensorHandleKind kind) : kind_(kind) {} + virtual ~AbstractTensorHandle() {} + + public: + AbstractTensorHandleKind getKind() const { return kind_; } + + // Release any underlying resources, including the interface object. + // + // WARNING: The destructor of this class is marked as protected to disallow + // clients from directly destroying this object since it may manage it's own + // lifetime through ref counting. Thus this must be allocated on the heap and + // clients MUST call Release() in order to destroy an instance of this class. + virtual void Release() = 0; + + private: + const AbstractTensorHandleKind kind_; +}; + +namespace internal { +struct AbstractTensorHandleDeleter { + void operator()(AbstractTensorHandle* p) const { + if (p != nullptr) { + p->Release(); + } + } +}; +} // namespace internal + +using AbstractTensorHandlePtr = + std::unique_ptr; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_ABSTRACT_TENSOR_HANDLE_H_ diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index fdc91675f8b..4be3cdd7c2d 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -21,6 +21,8 @@ limitations under the License. #include #include +#include "tensorflow/c/eager/abstract_tensor_handle.h" + // clang-format off #include "tensorflow/core/platform/platform.h" // clang-format on @@ -31,8 +33,8 @@ limitations under the License. 
#include "tensorflow/c/c_api_internal.h" #include "tensorflow/c/eager/c_api_experimental.h" #include "tensorflow/c/eager/c_api_internal.h" -#include "tensorflow/c/eager/operation_interface.h" -#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/eager/tfe_context_internal.h" #include "tensorflow/c/eager/tfe_op_internal.h" #include "tensorflow/c/eager/tfe_tensorhandle_internal.h" @@ -1119,7 +1121,7 @@ size_t TFE_TensorHandleDeviceMemorySize(TFE_TensorHandle* h, TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, TF_Status* status) { - tensorflow::AbstractOperationInterface* new_op = + tensorflow::ImmediateExecutionOperation* new_op = tensorflow::unwrap(ctx)->CreateOperation(); status->status = new_op->Reset(op_or_function_name, nullptr); if (!status->status.ok()) { @@ -1164,7 +1166,9 @@ void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* input, TF_Status* status) { void TFE_OpAddInputList(TFE_Op* op, TFE_TensorHandle** inputs, int num_inputs, TF_Status* status) { status->status = tensorflow::unwrap(op)->AddInputList( - {tensorflow::unwrap(inputs), static_cast(num_inputs)}); + {reinterpret_cast( + tensorflow::unwrap(inputs)), + static_cast(num_inputs)}); } TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name, @@ -1324,7 +1328,9 @@ void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name, void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name, const TFE_Op** value, int num_values) { auto s = tensorflow::unwrap(op)->SetAttrFunctionList( - attr_name, {tensorflow::unwrap(value), static_cast(num_values)}); + attr_name, {reinterpret_cast( + tensorflow::unwrap(value)), + static_cast(num_values)}); if (!s.ok()) { LOG(WARNING) << "Unable to set attribute: " << attr_name; } @@ -1368,7 +1374,10 @@ TF_CAPI_EXPORT extern int TFE_OpGetOutputLength(TFE_Op* op, void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals, TF_Status* status) { status->status = tensorflow::unwrap(op)->Execute( - absl::MakeSpan(tensorflow::unwrap(retvals), *num_retvals), num_retvals); + absl::MakeSpan(reinterpret_cast( + tensorflow::unwrap(retvals)), + *num_retvals), + num_retvals); } TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h, diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc index 0d71b11531b..7390cf243be 100644 --- a/tensorflow/c/eager/c_api_experimental.cc +++ b/tensorflow/c/eager/c_api_experimental.cc @@ -38,7 +38,7 @@ using tensorflow::string; void TFE_OpReset(TFE_Op* op_to_reset, const char* op_or_function_name, const char* raw_device_name, TF_Status* status) { if (op_to_reset) { - tensorflow::AbstractOperationInterface* op = + tensorflow::ImmediateExecutionOperation* op = tensorflow::unwrap(op_to_reset); op->Clear(); status->status = op->Reset(op_or_function_name, raw_device_name); @@ -60,6 +60,12 @@ void TFE_ContextDisableGraphCollection(TFE_Context* ctx) { context->SetShouldStoreGraphs(false); } +uint64_t TFE_GetContextId(TFE_Context* ctx) { + tensorflow::EagerContext* context = + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); + return context->GetContextId(); +} + void TFE_MonitoringCounterCellIncrementBy(TFE_MonitoringCounterCell* cell, int64_t value) { cell->cell.IncrementBy(value); diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h index 1b8efe61ee0..1af76c01154 100644 --- 
a/tensorflow/c/eager/c_api_experimental.h +++ b/tensorflow/c/eager/c_api_experimental.h @@ -300,6 +300,14 @@ TF_CAPI_EXPORT extern void TFE_ContextOptionsSetLazyRemoteInputsCopy( TF_CAPI_EXPORT extern void TFE_ContextOptionsSetTfrt(TFE_ContextOptions*, bool use_tfrt); +// Returns the context_id from the EagerContext which is used by the +// EagerService to maintain consistency between client and worker. The +// context_id is initialized with a dummy value and is later set when the worker +// is initialized (either locally or remotely). The context_id can change during +// the process lifetime although this should cause the worker to be +// reinitialized (e.g. cleared caches) as well. +TF_CAPI_EXPORT extern uint64_t TFE_GetContextId(TFE_Context* ctx); + // ----------------------------------------------------------------------------- // Cancellation APIs. diff --git a/tensorflow/c/eager/context_interface.h b/tensorflow/c/eager/immediate_execution_context.h similarity index 77% rename from tensorflow/c/eager/context_interface.h rename to tensorflow/c/eager/immediate_execution_context.h index e5a770a6826..77d59dd23e2 100644 --- a/tensorflow/c/eager/context_interface.h +++ b/tensorflow/c/eager/immediate_execution_context.h @@ -12,15 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_C_EAGER_CONTEXT_INTERFACE_H_ -#define TENSORFLOW_C_EAGER_CONTEXT_INTERFACE_H_ +#ifndef TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_CONTEXT_H_ +#define TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_CONTEXT_H_ +#include #include #include "absl/types/optional.h" #include "absl/types/span.h" -#include "tensorflow/c/eager/operation_interface.h" -#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/tensor_interface.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/numeric_types.h" @@ -34,16 +36,9 @@ namespace tensorflow { // // A context is responsible for creating key objects such as Tensors, // TensorHandles & Operations. -class AbstractContextInterface { +class ImmediateExecutionContext : public AbstractContext { public: - // Release any underlying resources, including the interface object. - // - // WARNING: The destructor of this class is marked as protected to disallow - // clients from directly destroying this object since it may manage it's own - // lifetime through ref counting. Thus clients MUST call Release() in order to - // destroy an instance of this class. - virtual void Release() = 0; - + static constexpr AbstractContextKind kKind = kImmediateExecution; // Optimized scalar creation functions virtual AbstractTensorInterface* CreateInt64Scalar(int64 value) = 0; virtual AbstractTensorInterface* CreateUint64Scalar(uint64 value) = 0; @@ -74,15 +69,15 @@ class AbstractContextInterface { void* memory_releaser_arg) = 0; // Create a handle to wrap and manage a Tensor - virtual AbstractTensorHandleInterface* CreateLocalHandle( + virtual ImmediateExecutionTensorHandle* CreateLocalHandle( AbstractTensorInterface* t) = 0; // Copy the handle to another device. 
- virtual AbstractTensorHandleInterface* CopyTensorHandleToDevice( - AbstractTensorHandleInterface* handle, const char* device_name, + virtual ImmediateExecutionTensorHandle* CopyTensorHandleToDevice( + ImmediateExecutionTensorHandle* handle, const char* device_name, Status* status) = 0; // Create an operation to perform op execution - virtual AbstractOperationInterface* CreateOperation() = 0; + ImmediateExecutionOperation* CreateOperation() override = 0; // Returns whether the runtime is backed by TFRT or the legacy TF Eager // Runtime. This is necessary to decouple runtime-dependent @@ -107,14 +102,26 @@ class AbstractContextInterface { // be executed as an op. Return error if the function with the same name // already exists. virtual Status AddFunctionDef(const FunctionDef& fdef) = 0; - // Remove a function. 'func' argument is the name of a previously added - // FunctionDef. The name is in fdef.signature.name. - virtual Status RemoveFunction(const string& func) = 0; protected: - virtual ~AbstractContextInterface() {} + ImmediateExecutionContext() : AbstractContext(kKind) {} + ~ImmediateExecutionContext() override {} }; +namespace internal { +struct ImmediateExecutionContextDeleter { + void operator()(ImmediateExecutionContext* p) const { + if (p != nullptr) { + p->Release(); + } + } +}; +} // namespace internal + +using ImmediateContextPtr = + std::unique_ptr; + } // namespace tensorflow -#endif // TENSORFLOW_C_EAGER_CONTEXT_INTERFACE_H_ +#endif // TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_CONTEXT_H_ diff --git a/tensorflow/c/eager/immediate_execution_operation.h b/tensorflow/c/eager/immediate_execution_operation.h new file mode 100644 index 00000000000..4e2959ba7af --- /dev/null +++ b/tensorflow/c/eager/immediate_execution_operation.h @@ -0,0 +1,69 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_OPERATION_H_ +#define TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_OPERATION_H_ + +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/tensor_interface.h" +#include "tensorflow/core/framework/device_attributes.pb.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/status.h" + +struct TFE_Op; + +namespace tensorflow { + +// Abstract interface to an operation. 
+class ImmediateExecutionOperation : public AbstractOperation { + public: + static constexpr AbstractOperationKind kKind = kImmediateExecution; + virtual void Clear() = 0; + + virtual const tensorflow::OpDef* OpDef() const = 0; + + virtual Status InputLength(const char* input_name, int* length) = 0; + virtual Status OutputLength(const char* output_name, int* length) = 0; + + // Experimental + virtual Status SetUseXla(bool enable) = 0; + + protected: + ImmediateExecutionOperation() : AbstractOperation(kKind) {} + ~ImmediateExecutionOperation() override {} +}; + +namespace internal { +struct ImmediateExecutionOperationDeleter { + void operator()(ImmediateExecutionOperation* p) const { + if (p != nullptr) { + p->Release(); + } + } +}; +} // namespace internal + +using ImmediateOpPtr = + std::unique_ptr; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_OPERATION_H_ diff --git a/tensorflow/c/eager/tensor_handle_interface.h b/tensorflow/c/eager/immediate_execution_tensor_handle.h similarity index 69% rename from tensorflow/c/eager/tensor_handle_interface.h rename to tensorflow/c/eager/immediate_execution_tensor_handle.h index 1ca40daec41..31aa3aa0f75 100644 --- a/tensorflow/c/eager/tensor_handle_interface.h +++ b/tensorflow/c/eager/immediate_execution_tensor_handle.h @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_C_EAGER_TENSOR_HANDLE_INTERFACE_H_ -#define TENSORFLOW_C_EAGER_TENSOR_HANDLE_INTERFACE_H_ +#ifndef TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_TENSOR_HANDLE_H_ +#define TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_TENSOR_HANDLE_H_ +#include "tensorflow/c/eager/abstract_tensor_handle.h" #include "tensorflow/c/tensor_interface.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/platform/status.h" @@ -30,15 +31,9 @@ namespace tensorflow { // files. The interface lists the common functionality that must be provided by // any concrete implementation. However, in cases where the true concrete class // is needed a static_cast can be applied. -class AbstractTensorHandleInterface { +class ImmediateExecutionTensorHandle : public AbstractTensorHandle { public: - // Release any underlying resources, including the interface object. - // - // WARNING: The destructor of this class is marked as protected to disallow - // clients from directly destroying this object since it may manage it's own - // lifetime through ref counting. Thus this must be allocated on the heap and - // clients MUST call Release() in order to destroy an instance of this class. - virtual void Release() = 0; + static constexpr AbstractTensorHandleKind kKind = kImmediateExecution; // Returns tensor dtype. virtual tensorflow::DataType DataType() const = 0; @@ -57,12 +52,27 @@ class AbstractTensorHandleInterface { virtual AbstractTensorInterface* Resolve(Status* status) = 0; // Return a copy of the handle. 
- virtual AbstractTensorHandleInterface* Copy() = 0; + virtual ImmediateExecutionTensorHandle* Copy() = 0; protected: - virtual ~AbstractTensorHandleInterface() {} + ImmediateExecutionTensorHandle() : AbstractTensorHandle(kKind) {} + ~ImmediateExecutionTensorHandle() override {} }; +namespace internal { +struct ImmediateExecutionTensorHandleDeleter { + void operator()(ImmediateExecutionTensorHandle* p) const { + if (p != nullptr) { + p->Release(); + } + } +}; +} // namespace internal + +using ImmediateTensorHandlePtr = + std::unique_ptr; + } // namespace tensorflow -#endif // TENSORFLOW_C_EAGER_TENSOR_HANDLE_INTERFACE_H_ +#endif // TENSORFLOW_C_EAGER_IMMEDIATE_EXECUTION_TENSOR_HANDLE_H_ diff --git a/tensorflow/c/eager/parallel_device/parallel_device_lib.cc b/tensorflow/c/eager/parallel_device/parallel_device_lib.cc index d0149b29c08..768f686bd88 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_lib.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device_lib.cc @@ -262,14 +262,14 @@ std::unique_ptr ParallelDevice::DeviceIDs( components.reserve(underlying_devices_.size()); for (int device_index = 0; device_index < underlying_devices_.size(); ++device_index) { - int64_t* device_id = new int64_t; + int32_t* device_id = new int32_t; *device_id = device_index; std::unique_ptr tensor( TF_NewTensor( - TF_INT64, /*dims=*/nullptr, /*num_dims=*/0, device_id, - sizeof(int64_t), + TF_INT32, /*dims=*/nullptr, /*num_dims=*/0, device_id, + sizeof(int32_t), [](void* data, size_t, void* arg) { - delete reinterpret_cast(data); + delete reinterpret_cast(data); }, nullptr), TF_DeleteTensor); @@ -283,7 +283,7 @@ std::unique_ptr ParallelDevice::DeviceIDs( if (TF_GetCode(status) != TF_OK) return nullptr; TFE_OpSetAttrTensor(const_op.get(), "value", tensor.get(), status); if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpSetAttrType(const_op.get(), "dtype", TF_INT64); + TFE_OpSetAttrType(const_op.get(), "dtype", TF_INT32); TFE_TensorHandle* device_handle; int num_outputs = 1; TFE_Execute(const_op.get(), &device_handle, &num_outputs, status); diff --git a/tensorflow/c/eager/parallel_device/parallel_device_testlib.cc b/tensorflow/c/eager/parallel_device/parallel_device_testlib.cc index fba47865c36..828dcbae093 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_testlib.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device_testlib.cc @@ -296,8 +296,8 @@ void BasicTestsForTwoDevices(TFE_Context* context, const char* first_device, TFE_DeleteTensorHandle(result_handle); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - ExpectScalarEq(components[0].get(), 0); - ExpectScalarEq(components[1].get(), 1); + ExpectScalarEq(components[0].get(), 0); + ExpectScalarEq(components[1].get(), 1); std::string first_device = TFE_TensorHandleBackingDeviceName(components[0].get(), status.get()); ASSERT_EQ(underlying_devices[0], first_device); diff --git a/tensorflow/c/eager/tfe_context_internal.h b/tensorflow/c/eager/tfe_context_internal.h index 1d29bee9ee3..1f2035317fa 100644 --- a/tensorflow/c/eager/tfe_context_internal.h +++ b/tensorflow/c/eager/tfe_context_internal.h @@ -16,7 +16,7 @@ limitations under the License. #define TENSORFLOW_C_EAGER_TFE_CONTEXT_INTERNAL_H_ #include "tensorflow/c/conversion_macros.h" -#include "tensorflow/c/eager/context_interface.h" +#include "tensorflow/c/eager/immediate_execution_context.h" // Wraps a pointer to a context implementation. 
// @@ -28,7 +28,7 @@ typedef struct TFE_Context TFE_Context; namespace tensorflow { -DEFINE_CONVERSION_FUNCTIONS(tensorflow::AbstractContextInterface, TFE_Context); +DEFINE_CONVERSION_FUNCTIONS(tensorflow::ImmediateExecutionContext, TFE_Context); } // namespace tensorflow diff --git a/tensorflow/c/eager/tfe_op_internal.h b/tensorflow/c/eager/tfe_op_internal.h index 6ca7f741d16..3fe94d358b6 100644 --- a/tensorflow/c/eager/tfe_op_internal.h +++ b/tensorflow/c/eager/tfe_op_internal.h @@ -16,7 +16,7 @@ limitations under the License. #define TENSORFLOW_C_EAGER_TFE_OP_INTERNAL_H_ #include "tensorflow/c/conversion_macros.h" -#include "tensorflow/c/eager/operation_interface.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" // Wraps a pointer to an operation implementation. // @@ -28,8 +28,8 @@ typedef struct TFE_Op TFE_Op; namespace tensorflow { -DEFINE_CONVERSION_FUNCTIONS(tensorflow::AbstractOperationInterface, TFE_Op); -DEFINE_CONVERSION_FUNCTIONS(tensorflow::AbstractOperationInterface*, TFE_Op*); +DEFINE_CONVERSION_FUNCTIONS(tensorflow::ImmediateExecutionOperation, TFE_Op); +DEFINE_CONVERSION_FUNCTIONS(tensorflow::ImmediateExecutionOperation*, TFE_Op*); } // namespace tensorflow diff --git a/tensorflow/c/eager/tfe_tensorhandle_internal.h b/tensorflow/c/eager/tfe_tensorhandle_internal.h index 543e5f1d932..308e8c24e2c 100644 --- a/tensorflow/c/eager/tfe_tensorhandle_internal.h +++ b/tensorflow/c/eager/tfe_tensorhandle_internal.h @@ -16,7 +16,7 @@ limitations under the License. #define TENSORFLOW_C_EAGER_TFE_TENSORHANDLE_INTERNAL_H_ #include "tensorflow/c/conversion_macros.h" -#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" // Wraps a pointer to a tensor handle implementation. // @@ -28,9 +28,9 @@ typedef struct TFE_TensorHandle TFE_TensorHandle; namespace tensorflow { -DEFINE_CONVERSION_FUNCTIONS(tensorflow::AbstractTensorHandleInterface, +DEFINE_CONVERSION_FUNCTIONS(tensorflow::ImmediateExecutionTensorHandle, TFE_TensorHandle); -DEFINE_CONVERSION_FUNCTIONS(tensorflow::AbstractTensorHandleInterface*, +DEFINE_CONVERSION_FUNCTIONS(tensorflow::ImmediateExecutionTensorHandle*, TFE_TensorHandle*); } // namespace tensorflow diff --git a/tensorflow/c/env.cc b/tensorflow/c/env.cc index 1c35ff9001d..ce715c43acb 100644 --- a/tensorflow/c/env.cc +++ b/tensorflow/c/env.cc @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/c/c_api_internal.h" #include "tensorflow/c/tf_status_helper.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/path.h" #include "tensorflow/core/platform/types.h" struct TF_StringStream { @@ -146,6 +147,10 @@ TF_StringStream* TF_GetLocalTempDirectories() { return list; } +char* TF_GetTempFileName(const char* extension) { + return strdup(::tensorflow::io::GetTempFilename(extension).c_str()); +} + TF_CAPI_EXPORT extern uint64_t TF_NowNanos(void) { return ::tensorflow::Env::Default()->NowNanos(); } diff --git a/tensorflow/c/env.h b/tensorflow/c/env.h index 2a763730bc3..7dc7ac32f08 100644 --- a/tensorflow/c/env.h +++ b/tensorflow/c/env.h @@ -152,6 +152,10 @@ TF_CAPI_EXPORT extern TF_StringStream* TF_GetChildren(const char* filename, // The caller is responsible for freeing the list (see TF_StringStreamDone). TF_CAPI_EXPORT extern TF_StringStream* TF_GetLocalTempDirectories(void); +// Creates a temporary file name with an extension. +// The caller is responsible for freeing the returned pointer. 
+TF_CAPI_EXPORT extern char* TF_GetTempFileName(const char* extension); + // Returns the number of nanoseconds since the Unix epoch. TF_CAPI_EXPORT extern uint64_t TF_NowNanos(void); diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD index c9fee433589..f61aa8347d4 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD @@ -1,5 +1,5 @@ # Experimental gcs filesystem plugin. -load("//tensorflow:tensorflow.bzl", "get_win_copts", "tf_cc_shared_object") +load("//tensorflow:tensorflow.bzl", "get_win_copts", "tf_cc_shared_object", "tf_cc_test") package( licenses = ["notice"], # Apache 2.0 @@ -24,9 +24,45 @@ cc_library( "//tensorflow:windows": get_win_copts(), }), deps = [ + ":gcs_helper", + "//tensorflow/c:env", "//tensorflow/c:tf_status", "//tensorflow/c/experimental/filesystem:filesystem_interface", "@com_github_googlecloudplatform_google_cloud_cpp//:storage_client", "@com_google_absl//absl/strings", ], ) + +cc_library( + name = "gcs_helper", + srcs = ["gcs_helper.cc"], + hdrs = ["gcs_helper.h"], + linkstatic = 1, + deps = [ + "//tensorflow/c:env", + ], +) + +tf_cc_test( + name = "gcs_filesystem_test", + srcs = [ + "gcs_filesystem.cc", + "gcs_filesystem_test.cc", + ], + local_defines = ["TF_GCS_FILESYSTEM_TEST"], + tags = [ + "manual", + "notap", + ], + deps = [ + ":gcs_helper", + "//tensorflow/c:env", + "//tensorflow/c:tf_status", + "//tensorflow/c:tf_status_helper", + "//tensorflow/c/experimental/filesystem:filesystem_interface", + "//tensorflow/core/platform:stacktrace_handler", + "//tensorflow/core/platform:test", + "@com_github_googlecloudplatform_google_cloud_cpp//:storage_client", + "@com_google_absl//absl/strings", + ], +) diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc index 8c54bc85439..8c5c035f939 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc @@ -15,11 +15,23 @@ limitations under the License. #include #include +#include + #include "absl/strings/string_view.h" #include "google/cloud/storage/client.h" +#include "tensorflow/c/env.h" #include "tensorflow/c/experimental/filesystem/filesystem_interface.h" +#include "tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h" #include "tensorflow/c/tf_status.h" +#ifdef TF_GCS_FILESYSTEM_TEST +// For testing purpose, we expose some functions. +#define TF_STATIC +#else +// Otherwise, we don't expose any symbol. +#define TF_STATIC static +#endif + // Implementation of a filesystem for GCS environments. // This filesystem will support `gs://` URI schemes. namespace gcs = google::cloud::storage; @@ -86,6 +98,20 @@ namespace tf_random_access_file { // SECTION 2. 
Implementation for `TF_WritableFile` // ---------------------------------------------------------------------------- namespace tf_writable_file { +typedef struct GCSFile { + const char* bucket; + const char* object; + gcs::Client* gcs_client; // not owned + TempFile outfile; + bool sync_need; +} GCSFile; + +static void Cleanup(TF_WritableFile* file) { + auto gcs_file = static_cast(file->plugin_file); + plugin_memory_free(const_cast(gcs_file->bucket)); + plugin_memory_free(const_cast(gcs_file->object)); + delete gcs_file; +} // TODO(vnvo2409): Implement later @@ -104,7 +130,7 @@ namespace tf_read_only_memory_region { namespace tf_gcs_filesystem { // TODO(vnvo2409): Add lazy-loading and customizing parameters. -static void Init(TF_Filesystem* filesystem, TF_Status* status) { +TF_STATIC void Init(TF_Filesystem* filesystem, TF_Status* status) { google::cloud::StatusOr client = gcs::Client::CreateDefaultClient(); if (!client) { @@ -117,8 +143,54 @@ static void Init(TF_Filesystem* filesystem, TF_Status* status) { TF_SetStatus(status, TF_OK, ""); } +static void Cleanup(TF_Filesystem* filesystem) { + plugin_memory_free(filesystem->plugin_filesystem); +} + // TODO(vnvo2409): Implement later +static void NewWritableFile(const TF_Filesystem* filesystem, const char* path, + TF_WritableFile* file, TF_Status* status) { + char* bucket; + char* object; + ParseGCSPath(path, false, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) return; + + auto gcs_client = static_cast(filesystem->plugin_filesystem); + char* temp_file_name = TF_GetTempFileName(""); + file->plugin_file = new tf_writable_file::GCSFile( + {bucket, object, gcs_client, + TempFile(temp_file_name, std::ios::binary | std::ios::out), true}); + // We are responsible for freeing the pointer returned by TF_GetTempFileName + free(temp_file_name); + TF_SetStatus(status, TF_OK, ""); +} + +static void NewAppendableFile(const TF_Filesystem* filesystem, const char* path, + TF_WritableFile* file, TF_Status* status) { + char* bucket; + char* object; + ParseGCSPath(path, false, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) return; + + auto gcs_client = static_cast(filesystem->plugin_filesystem); + char* temp_file_name = TF_GetTempFileName(""); + + auto gcs_status = gcs_client->DownloadToFile(bucket, object, temp_file_name); + TF_SetStatusFromGCSStatus(gcs_status, status); + auto status_code = TF_GetCode(status); + if (status_code != TF_OK && status_code != TF_NOT_FOUND) { + return; + } + // If this file does not exist on server, we will need to sync it. 
+ bool sync_need = (status_code == TF_NOT_FOUND); + file->plugin_file = new tf_writable_file::GCSFile( + {bucket, object, gcs_client, + TempFile(temp_file_name, std::ios::binary | std::ios::app), sync_need}); + free(temp_file_name); + TF_SetStatus(status, TF_OK, ""); +} + } // namespace tf_gcs_filesystem static void ProvideFilesystemSupportFor(TF_FilesystemPluginOps* ops, @@ -126,9 +198,17 @@ static void ProvideFilesystemSupportFor(TF_FilesystemPluginOps* ops, TF_SetFilesystemVersionMetadata(ops); ops->scheme = strdup(uri); + ops->writable_file_ops = static_cast( + plugin_memory_allocate(TF_WRITABLE_FILE_OPS_SIZE)); + ops->writable_file_ops->cleanup = tf_writable_file::Cleanup; + ops->filesystem_ops = static_cast( plugin_memory_allocate(TF_FILESYSTEM_OPS_SIZE)); ops->filesystem_ops->init = tf_gcs_filesystem::Init; + ops->filesystem_ops->cleanup = tf_gcs_filesystem::Cleanup; + ops->filesystem_ops->new_writable_file = tf_gcs_filesystem::NewWritableFile; + ops->filesystem_ops->new_appendable_file = + tf_gcs_filesystem::NewAppendableFile; } void TF_InitPlugin(TF_FilesystemPluginInfo* info) { diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem_test.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem_test.cc new file mode 100644 index 00000000000..43221763791 --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem_test.cc @@ -0,0 +1,61 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/experimental/filesystem/filesystem_interface.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/core/platform/stacktrace_handler.h" +#include "tensorflow/core/platform/test.h" + +#define ASSERT_TF_OK(x) ASSERT_EQ(TF_OK, TF_GetCode(x)) + +// Forward declaration +namespace tf_gcs_filesystem { +void Init(TF_Filesystem* filesystem, TF_Status* status); +} + +namespace tensorflow { +namespace { + +class GCSFilesystemTest : public ::testing::Test { + public: + void SetUp() override { + status_ = TF_NewStatus(); + filesystem_ = new TF_Filesystem; + tf_gcs_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_) << "Can not initialize filesystem. " + << TF_Message(status_); + } + void TearDown() override { + TF_DeleteStatus(status_); + // TODO(vnvo2409): Add filesystem cleanup + delete filesystem_; + } + + protected: + TF_Filesystem* filesystem_; + TF_Status* status_; +}; + +// We have to add this test here because there must be at least one test. +// This test will be removed in the future. 
+TEST_F(GCSFilesystemTest, TestInit) { ASSERT_TF_OK(status_); } + +} // namespace +} // namespace tensorflow + +GTEST_API_ int main(int argc, char** argv) { + tensorflow::testing::InstallStacktraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.cc new file mode 100644 index 00000000000..4504a9f3b35 --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.cc @@ -0,0 +1,34 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h" + +#include + +#include +#include +#include + +TempFile::TempFile(const char* temp_file_name, std::ios::openmode mode) + : std::fstream(temp_file_name, mode), name_(temp_file_name) {} + +TempFile::TempFile(TempFile&& rhs) + : std::fstream(std::move(rhs)), name_(std::move(rhs.name_)) {} + +TempFile::~TempFile() { + std::fstream::close(); + std::remove(name_.c_str()); +} + +const std::string TempFile::getName() const { return name_; } diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h new file mode 100644 index 00000000000..1a521ca4f1e --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h @@ -0,0 +1,33 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_GCS_HELPER_H_ +#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_GCS_HELPER_H_ + +#include +#include + +class TempFile : public std::fstream { + public: + // We should specify openmode each time we call TempFile. 
+ TempFile(const char* temp_file_name, std::ios::openmode mode); + TempFile(TempFile&& rhs); + ~TempFile() override; + const std::string getName() const; + + private: + const std::string name_; +}; + +#endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_GCS_HELPER_H_ diff --git a/tensorflow/c/experimental/saved_model/core/BUILD b/tensorflow/c/experimental/saved_model/core/BUILD index 2e817ed02e0..dbe1b6d656c 100644 --- a/tensorflow/c/experimental/saved_model/core/BUILD +++ b/tensorflow/c/experimental/saved_model/core/BUILD @@ -23,8 +23,8 @@ cc_library( ], deps = [ ":function_metadata", - "//tensorflow/c/eager:operation_interface", - "//tensorflow/c/eager:tensor_handle_interface", + "//tensorflow/c/eager:immediate_execution_operation", + "//tensorflow/c/eager:immediate_execution_tensor_handle", "//tensorflow/core:protos_all_cc", ], ) diff --git a/tensorflow/c/experimental/saved_model/core/concrete_function.cc b/tensorflow/c/experimental/saved_model/core/concrete_function.cc index d5da2ca9bf4..41bae4352fc 100644 --- a/tensorflow/c/experimental/saved_model/core/concrete_function.cc +++ b/tensorflow/c/experimental/saved_model/core/concrete_function.cc @@ -15,12 +15,12 @@ limitations under the License. #include "tensorflow/c/experimental/saved_model/core/concrete_function.h" -#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/experimental/saved_model/core/function_metadata.h" namespace tensorflow { -const std::vector& +const std::vector& ConcreteFunction::GetCaptures() const { return captures_; } diff --git a/tensorflow/c/experimental/saved_model/core/concrete_function.h b/tensorflow/c/experimental/saved_model/core/concrete_function.h index 6f8a5375277..22535641ef5 100644 --- a/tensorflow/c/experimental/saved_model/core/concrete_function.h +++ b/tensorflow/c/experimental/saved_model/core/concrete_function.h @@ -18,8 +18,8 @@ limitations under the License. #include -#include "tensorflow/c/eager/operation_interface.h" -#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/experimental/saved_model/core/function_metadata.h" #include "tensorflow/core/framework/function.pb.h" @@ -38,15 +38,15 @@ class ConcreteFunction { virtual ~ConcreteFunction() = 0; // This method returns the "Call" Op used to execute the function. 
- virtual AbstractOperationInterface* GetCallOp() = 0; + virtual ImmediateExecutionOperation* GetCallOp() = 0; - const std::vector& GetCaptures() + const std::vector& GetCaptures() const; const FunctionMetadata& GetFunctionMetadata() const; private: FunctionMetadata metadata_; - std::vector captures_; + std::vector captures_; FunctionDef* function_; }; diff --git a/tensorflow/c/experimental/saved_model/core/ops/BUILD b/tensorflow/c/experimental/saved_model/core/ops/BUILD index b42e93c3716..1e2496487f9 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/BUILD +++ b/tensorflow/c/experimental/saved_model/core/ops/BUILD @@ -14,44 +14,6 @@ package( licenses = ["notice"], # Apache 2.0 ) -cc_library( - name = "owned_eager_op", - hdrs = [ - "owned_eager_op.h", - ], - deps = [ - "//tensorflow/c/eager:operation_interface", - ], -) - -cc_library( - name = "owned_tensor_handle", - hdrs = [ - "owned_tensor_handle.h", - ], - deps = [ - "//tensorflow/c/eager:tensor_handle_interface", - "//tensorflow/core/common_runtime/eager:tensor_handle", - ], -) - -cc_library( - name = "owned_eager_context", - hdrs = ["owned_eager_context.h"], - deps = [ - "//tensorflow/c/eager:context_interface", - "//tensorflow/core/common_runtime/eager:context", - ], -) - -cc_library( - name = "owned_tensor", - hdrs = ["owned_tensor.h"], - deps = [ - "//tensorflow/c:tensor_interface", - ], -) - cc_library( name = "variable_ops", srcs = [ @@ -61,10 +23,11 @@ cc_library( "variable_ops.h", ], deps = [ - ":owned_eager_op", - ":owned_tensor_handle", - "//tensorflow/c/eager:context_interface", - "//tensorflow/c/eager:tensor_handle_interface", + "//tensorflow/c/eager:abstract_operation", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:immediate_execution_context", + "//tensorflow/c/eager:immediate_execution_operation", + "//tensorflow/c/eager:immediate_execution_tensor_handle", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -78,10 +41,11 @@ tf_cc_test( "variable_ops_test.cc", ], deps = [ - ":owned_eager_context", - ":owned_tensor", - ":owned_tensor_handle", ":variable_ops", + "//tensorflow/c:tensor_interface", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:immediate_execution_context", + "//tensorflow/c/eager:immediate_execution_tensor_handle", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", diff --git a/tensorflow/c/experimental/saved_model/core/ops/owned_eager_context.h b/tensorflow/c/experimental/saved_model/core/ops/owned_eager_context.h deleted file mode 100644 index 300059cd069..00000000000 --- a/tensorflow/c/experimental/saved_model/core/ops/owned_eager_context.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_OWNED_EAGER_CONTEXT_H_ -#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_OWNED_EAGER_CONTEXT_H_ - -#include - -#include "tensorflow/c/eager/context_interface.h" -#include "tensorflow/core/common_runtime/eager/context.h" - -namespace tensorflow { -namespace internal { - -struct AbstractContextInterfaceDeleter { - void operator()(AbstractContextInterface* p) const { - if (p != nullptr) { - p->Release(); - } - } -}; - -struct EagerContextDeleter { - void operator()(EagerContext* p) const { - if (p != nullptr) { - p->Release(); - } - } -}; - -} // namespace internal - -using AbstractContextPtr = - std::unique_ptr; - -using EagerContextPtr = - std::unique_ptr; - -} // namespace tensorflow - -#endif // THIRD_PARTY_TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_OWNED_EAGER_CONTEXT_H_ diff --git a/tensorflow/c/experimental/saved_model/core/ops/owned_tensor_handle.h b/tensorflow/c/experimental/saved_model/core/ops/owned_tensor_handle.h deleted file mode 100644 index e98d6554afb..00000000000 --- a/tensorflow/c/experimental/saved_model/core/ops/owned_tensor_handle.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OWNED_TENSOR_HANDLE_H_ -#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OWNED_TENSOR_HANDLE_H_ - -#include - -#include "tensorflow/c/eager/tensor_handle_interface.h" -#include "tensorflow/core/common_runtime/eager/tensor_handle.h" - -namespace tensorflow { -namespace internal { - -struct TensorHandleDeleter { - void operator()(TensorHandle* p) const { - if (p != nullptr) { - p->Release(); - } - } -}; - -struct AbstractTensorHandleDeleter { - void operator()(AbstractTensorHandleInterface* p) const { - if (p != nullptr) { - p->Release(); - } - } -}; - -} // namespace internal - -using TensorHandlePtr = - std::unique_ptr; - -using AbstractTensorHandlePtr = - std::unique_ptr; - -} // namespace tensorflow - -#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OWNED_TENSOR_HANDLE_H_ diff --git a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc index a3b3ace7be9..67c592fc16b 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc +++ b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.cc @@ -16,9 +16,11 @@ limitations under the License. 
#include "tensorflow/c/experimental/saved_model/core/ops/variable_ops.h" #include "absl/types/span.h" -#include "tensorflow/c/eager/context_interface.h" -#include "tensorflow/c/experimental/saved_model/core/ops/owned_eager_op.h" -#include "tensorflow/c/experimental/saved_model/core/ops/owned_tensor_handle.h" +#include "tensorflow/c/eager/abstract_operation.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" @@ -32,10 +34,10 @@ namespace internal { static const char kNoSharingResourceID[] = "cd2c89b7-88b7-44c8-ad83-06c2a9158347"; -Status CreateUninitializedResourceVariable(AbstractContextInterface* ctx, +Status CreateUninitializedResourceVariable(ImmediateExecutionContext* ctx, DataType dtype, TensorShape shape, - AbstractTensorHandlePtr* handle) { - AbstractOpPtr varhandle_op = AbstractOpPtr(ctx->CreateOperation()); + ImmediateTensorHandlePtr* handle) { + ImmediateOpPtr varhandle_op(ctx->CreateOperation()); TF_RETURN_IF_ERROR(varhandle_op->Reset("VarHandleOp", nullptr)); TF_RETURN_IF_ERROR(varhandle_op->SetAttrType("dtype", dtype)); @@ -50,18 +52,23 @@ Status CreateUninitializedResourceVariable(AbstractContextInterface* ctx, TF_RETURN_IF_ERROR(varhandle_op->SetAttrString( "shared_name", kNoSharingResourceID, strlen(kNoSharingResourceID))); - AbstractTensorHandleInterface* var_handle = nullptr; + AbstractTensorHandle* var_handle = nullptr; int num_retvals = 1; TF_RETURN_IF_ERROR(varhandle_op->Execute( absl::MakeSpan(&var_handle, num_retvals), &num_retvals)); - handle->reset(var_handle); + AbstractTensorHandlePtr owned_var_handle(var_handle); + if (owned_var_handle->getKind() != ImmediateExecutionTensorHandle::kKind) { + return errors::Internal("Unexpected tensor handle kind."); + } + handle->reset(reinterpret_cast( + owned_var_handle.release())); return Status(); } -Status AssignVariable(AbstractContextInterface* ctx, - AbstractTensorHandleInterface* variable_handle, - DataType dtype, AbstractTensorHandleInterface* value) { - AbstractOpPtr assign_op(ctx->CreateOperation()); +Status AssignVariable(ImmediateExecutionContext* ctx, + ImmediateExecutionTensorHandle* variable_handle, + DataType dtype, ImmediateExecutionTensorHandle* value) { + ImmediateOpPtr assign_op(ctx->CreateOperation()); TF_RETURN_IF_ERROR(assign_op->Reset("AssignVariableOp", nullptr)); TF_RETURN_IF_ERROR(assign_op->SetAttrType("dtype", dtype)); TF_RETURN_IF_ERROR(assign_op->AddInput(variable_handle)); @@ -72,25 +79,30 @@ Status AssignVariable(AbstractContextInterface* ctx, return Status(); } -Status ReadVariable(AbstractContextInterface* ctx, - AbstractTensorHandleInterface* variable_handle, - DataType dtype, AbstractTensorHandlePtr* output) { - AbstractOpPtr read_op = AbstractOpPtr(ctx->CreateOperation()); +Status ReadVariable(ImmediateExecutionContext* ctx, + ImmediateExecutionTensorHandle* variable_handle, + DataType dtype, ImmediateTensorHandlePtr* output) { + ImmediateOpPtr read_op(ctx->CreateOperation()); TF_RETURN_IF_ERROR(read_op->Reset("ReadVariableOp", nullptr)); TF_RETURN_IF_ERROR(read_op->SetAttrType("dtype", dtype)); TF_RETURN_IF_ERROR(read_op->AddInput(variable_handle)); - AbstractTensorHandleInterface* value = nullptr; + AbstractTensorHandle* value = nullptr; int 
num_retvals = 1; TF_RETURN_IF_ERROR( read_op->Execute(absl::MakeSpan(&value, num_retvals), &num_retvals)); - output->reset(value); + AbstractTensorHandlePtr owned_value(value); + if (owned_value->getKind() != ImmediateExecutionTensorHandle::kKind) { + return errors::Internal("Unexpected tensor handle kind."); + } + output->reset( + reinterpret_cast(owned_value.release())); return Status(); } -Status DestroyResource(AbstractContextInterface* ctx, - AbstractTensorHandleInterface* handle) { - AbstractOpPtr destroy_op = AbstractOpPtr(ctx->CreateOperation()); +Status DestroyResource(ImmediateExecutionContext* ctx, + ImmediateExecutionTensorHandle* handle) { + ImmediateOpPtr destroy_op(ctx->CreateOperation()); TF_RETURN_IF_ERROR(destroy_op->Reset("DestroyResourceOp", nullptr)); TF_RETURN_IF_ERROR(destroy_op->SetAttrBool("ignore_lookup_error", true)); TF_RETURN_IF_ERROR(destroy_op->AddInput(handle)); diff --git a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h index 8a410328b9e..13c941a77fe 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h +++ b/tensorflow/c/experimental/saved_model/core/ops/variable_ops.h @@ -16,9 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_VARIABLE_OPS_H #define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_OPS_VARIABLE_OPS_H -#include "tensorflow/c/eager/context_interface.h" -#include "tensorflow/c/eager/tensor_handle_interface.h" -#include "tensorflow/c/experimental/saved_model/core/ops/owned_tensor_handle.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/platform/status.h" @@ -30,31 +29,31 @@ namespace internal { // TensorHandle associated with the variable. This is equivalent to creating an // unitialized TF2 tf.Variable. // https://github.com/tensorflow/tensorflow/blob/516608035f85cec8b126712b0ff8407220206b22/tensorflow/python/ops/resource_variable_ops.py#L1867-L1872 -Status CreateUninitializedResourceVariable(AbstractContextInterface* ctx, +Status CreateUninitializedResourceVariable(ImmediateExecutionContext* ctx, DataType dtype, TensorShape shape, - AbstractTensorHandlePtr* handle); + ImmediateTensorHandlePtr* handle); // Executes an AssignVariableOp using `ctx`, assigning the variable associated // with `variable_handle` with `value`. `dtype` must be the datatype of the // underlying variable for `variable_handle`. Note that it is illegal to assign // a variable to a Tensor with a different dtype than what the variable was // created with. -Status AssignVariable(AbstractContextInterface* ctx, - AbstractTensorHandleInterface* variable_handle, - DataType dtype, AbstractTensorHandleInterface* value); +Status AssignVariable(ImmediateExecutionContext* ctx, + ImmediateExecutionTensorHandle* variable_handle, + DataType dtype, ImmediateExecutionTensorHandle* value); // Executes a ReadVariableOp using `ctx`. This reads the underlying variable // value of `variable_handle` and copies the value to `output`. `dtype` must be // the dtype of the variable associated with `variable_handle`. 
-Status ReadVariable(AbstractContextInterface* ctx, - AbstractTensorHandleInterface* variable_handle, - DataType dtype, AbstractTensorHandlePtr* output); +Status ReadVariable(ImmediateExecutionContext* ctx, + ImmediateExecutionTensorHandle* variable_handle, + DataType dtype, ImmediateTensorHandlePtr* output); // Executes DestroyResourceOp on `handle`, using `ctx`. This is equivalent to // the cleanup that occurs in a tf.Variable's EagerResourceDeleter: // https://github.com/tensorflow/tensorflow/blob/516608035f85cec8b126712b0ff8407220206b22/tensorflow/python/ops/resource_variable_ops.py#L289-L290 -Status DestroyResource(AbstractContextInterface* ctx, - AbstractTensorHandleInterface* handle); +Status DestroyResource(ImmediateExecutionContext* ctx, + ImmediateExecutionTensorHandle* handle); } // namespace internal } // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/ops/variable_ops_test.cc b/tensorflow/c/experimental/saved_model/core/ops/variable_ops_test.cc index 3c57ed4d38a..09c45332efc 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/variable_ops_test.cc +++ b/tensorflow/c/experimental/saved_model/core/ops/variable_ops_test.cc @@ -17,9 +17,8 @@ limitations under the License. #include -#include "tensorflow/c/experimental/saved_model/core/ops/owned_eager_context.h" -#include "tensorflow/c/experimental/saved_model/core/ops/owned_tensor.h" -#include "tensorflow/c/experimental/saved_model/core/ops/owned_tensor_handle.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/tensor_interface.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/eager/context.h" #include "tensorflow/core/framework/tensor.h" @@ -30,10 +29,10 @@ limitations under the License. 
namespace tensorflow { namespace { -AbstractTensorHandlePtr CreateScalarTensorHandle(EagerContext* context, - float value) { +ImmediateTensorHandlePtr CreateScalarTensorHandle(EagerContext* context, + float value) { AbstractTensorPtr tensor(context->CreateFloatScalar(value)); - AbstractTensorHandlePtr handle(context->CreateLocalHandle(tensor.get())); + ImmediateTensorHandlePtr handle(context->CreateLocalHandle(tensor.get())); return handle; } @@ -62,7 +61,7 @@ class VariableOpsTest : public ::testing::Test { // Sanity check for variable creation TEST_F(VariableOpsTest, CreateVariableSuccessful) { // Create a DT_Resource TensorHandle that points to a scalar DT_FLOAT tensor - AbstractTensorHandlePtr handle; + ImmediateTensorHandlePtr handle; TF_EXPECT_OK(internal::CreateUninitializedResourceVariable( context(), DT_FLOAT, {}, &handle)); // The created TensorHandle should be a DT_Resource @@ -72,7 +71,7 @@ TEST_F(VariableOpsTest, CreateVariableSuccessful) { // Sanity check for variable destruction TEST_F(VariableOpsTest, DestroyVariableSuccessful) { // Create a DT_Resource TensorHandle that points to a scalar DT_FLOAT tensor - AbstractTensorHandlePtr handle; + ImmediateTensorHandlePtr handle; TF_EXPECT_OK(internal::CreateUninitializedResourceVariable( context(), DT_FLOAT, {}, &handle)); @@ -83,18 +82,18 @@ TEST_F(VariableOpsTest, DestroyVariableSuccessful) { // Sanity check for handle assignment and reading TEST_F(VariableOpsTest, AssignVariableAndReadSuccessful) { // Create a DT_Resource TensorHandle that points to a scalar DT_FLOAT tensor - AbstractTensorHandlePtr variable; + ImmediateTensorHandlePtr variable; TF_EXPECT_OK(internal::CreateUninitializedResourceVariable( context(), DT_FLOAT, {}, &variable)); // Create a Scalar float TensorHandle with value 42, and assign it to // the variable. - AbstractTensorHandlePtr my_value = CreateScalarTensorHandle(context(), 42.0); + ImmediateTensorHandlePtr my_value = CreateScalarTensorHandle(context(), 42.0); TF_EXPECT_OK(internal::AssignVariable(context(), variable.get(), DT_FLOAT, my_value.get())); // Read back the value from the variable, and check that it is 42. - AbstractTensorHandlePtr read_value_handle; + ImmediateTensorHandlePtr read_value_handle; TF_EXPECT_OK(internal::ReadVariable(context(), variable.get(), DT_FLOAT, &read_value_handle)); Status status; diff --git a/tensorflow/c/experimental/saved_model/internal/BUILD b/tensorflow/c/experimental/saved_model/internal/BUILD index 72474940c16..888c284bb12 100644 --- a/tensorflow/c/experimental/saved_model/internal/BUILD +++ b/tensorflow/c/experimental/saved_model/internal/BUILD @@ -178,7 +178,7 @@ cc_library( ":tensorhandle_list_type", "//tensorflow/c:c_api_macros", "//tensorflow/c/eager:c_api", - "//tensorflow/c/eager:tensor_handle_interface", + "//tensorflow/c/eager:immediate_execution_tensor_handle", "//tensorflow/c/eager:tfe_tensorhandle_internal", ], ) @@ -190,7 +190,7 @@ cc_library( ], deps = [ "//tensorflow/c:conversion_macros", - "//tensorflow/c/eager:tensor_handle_interface", + "//tensorflow/c/eager:immediate_execution_tensor_handle", ], ) diff --git a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc index 7d018658101..c8f00c1f7c0 100644 --- a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc +++ b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc @@ -17,7 +17,7 @@ limitations under the License. 
#include -#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/eager/tfe_tensorhandle_internal.h" #include "tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h" diff --git a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h index 8cbec2806a8..566417df025 100644 --- a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h +++ b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h @@ -19,7 +19,7 @@ limitations under the License. #include #include "tensorflow/c/conversion_macros.h" -#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" // Internal structures used by the SavedModel C API. These are likely to // change and should not be depended on. @@ -29,7 +29,7 @@ typedef struct TF_TensorHandleList TF_TensorHandleList; namespace tensorflow { DEFINE_CONVERSION_FUNCTIONS( - std::vector, + std::vector, TF_TensorHandleList) } // namespace tensorflow diff --git a/tensorflow/c/tensor_interface.h b/tensorflow/c/tensor_interface.h index eb0d28b0bf9..d165c84980c 100644 --- a/tensorflow/c/tensor_interface.h +++ b/tensorflow/c/tensor_interface.h @@ -54,6 +54,20 @@ class AbstractTensorInterface { virtual ~AbstractTensorInterface() {} }; +namespace internal { +struct AbstractTensorInterfaceDeleter { + void operator()(AbstractTensorInterface* p) const { + if (p != nullptr) { + p->Release(); + } + } +}; +} // namespace internal + +using AbstractTensorPtr = + std::unique_ptr; + } // namespace tensorflow #endif // TENSORFLOW_C_TENSOR_INTERFACE_H_ diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc index 942ec08f451..f5a09e09dcd 100644 --- a/tensorflow/cc/gradients/nn_grad_test.cc +++ b/tensorflow/cc/gradients/nn_grad_test.cc @@ -259,9 +259,6 @@ TEST_F(NNGradTest, MaxPoolGradV2Helper) { RunTest(x, x_init_value, y, y_shape); } -// TODO(rocm): -// Re-enable this test once 3D pooling is supported on ROCm platform -#ifndef TENSORFLOW_USE_ROCM TEST_F(NNGradTest, MaxPool3DGradHelper) { TensorShape x_shape({1, 3, 3, 3, 1}); TensorShape y_shape({1, 1, 1, 1, 1}); @@ -274,7 +271,6 @@ TEST_F(NNGradTest, MaxPool3DGradHelper) { SetRandomValuesForMaxPooling(&x_init_value); RunTest(x, x_init_value, y, y_shape); } -#endif TEST_F(NNGradTest, AvgPoolGradHelper) { TensorShape x_shape({1, 2, 2, 1}); @@ -287,9 +283,6 @@ TEST_F(NNGradTest, AvgPoolGradHelper) { RunTest(x, x_shape, y, y_shape); } -// TODO(rocm): -// Re-enable this test once 3D pooling is supported on ROCm platform -#ifndef TENSORFLOW_USE_ROCM TEST_F(NNGradTest, AvgPool3DGradHelper) { TensorShape x_shape({1, 3, 3, 3, 1}); TensorShape y_shape({1, 1, 1, 1, 1}); @@ -300,7 +293,6 @@ TEST_F(NNGradTest, AvgPool3DGradHelper) { auto y = AvgPool3D(scope_, x, ksize, strides, "SAME"); RunTest(x, x_shape, y, y_shape); } -#endif TEST_F(NNGradTest, LRN) { TensorShape x_shape({1, 1, 2, 1}); diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index 0fc1a349adc..e3542586c89 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -108,8 +108,7 @@ class XlaExecutableClosure { explicit XlaExecutableClosure( xla::LocalClient* client, xla::LocalExecutable* executable, const XlaCompiler::CompilationResult* compilation_result, - std::map 
resource_var_snapshots, - int num_constant_args) + ResourceVarsSnapshot resource_var_snapshots, int num_constant_args) : client_(client), executable_(executable), compilation_result_(compilation_result), @@ -124,7 +123,7 @@ class XlaExecutableClosure { const XlaCompiler::CompilationResult* compilation_result() const { return compilation_result_; } - const std::map& resource_var_snapshots() const { + const ResourceVarsSnapshot& resource_var_snapshots() const { return resource_var_snapshots_; } int num_constant_args() const { return num_constant_args_; } @@ -133,7 +132,7 @@ class XlaExecutableClosure { xla::LocalClient* client_; xla::LocalExecutable* executable_; const XlaCompiler::CompilationResult* compilation_result_; - std::map resource_var_snapshots_; + ResourceVarsSnapshot resource_var_snapshots_; int num_constant_args_; TF_DISALLOW_COPY_AND_ASSIGN(XlaExecutableClosure); @@ -276,10 +275,10 @@ static Status BuildCompilationCache(OpKernelContext* ctx, static Status CompileToLocalExecutable( OpKernelContext* ctx, const NameAttrList& function, bool has_ref_vars, - const XlaPlatformInfo& platform_info, absl::Span resources, + const XlaPlatformInfo& platform_info, + absl::Span variable_infos, absl::Span constants, bool lazy, xla::LocalClient** client, - std::map* variables, - const XlaCompiler::CompilationResult** kernel, + const XlaCompiler::CompilationResult** compilation_result, xla::LocalExecutable** executable) { // We store information about the JIT-compiled XLA computation // in the ResourceMgr. @@ -299,7 +298,6 @@ static Status CompileToLocalExecutable( // this is more obviously correct.) core::ScopedUnref cache_ref(cache); - TF_RETURN_IF_ERROR(SnapshotResourceVariables(ctx, resources, variables)); *client = static_cast(cache->client()); absl::optional tf_allocator_adapter; @@ -337,11 +335,11 @@ static Status CompileToLocalExecutable( std::vector args; TF_RETURN_IF_ERROR(XlaComputationLaunchContext::BuildXlaCompilerArguments( - constant_args, *variables, ctx, &args)); + constant_args, variable_infos, ctx, &args)); return cache->Compile(options, function, args, compile_options, lazy ? 
XlaCompilationCache::CompileMode::kLazy : XlaCompilationCache::CompileMode::kStrict, - kernel, executable); + compilation_result, executable); } void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { @@ -349,16 +347,22 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { << Canonicalize(function_.name(), AttrSlice(&function_.attr())); xla::LocalClient* client; - const XlaCompiler::CompilationResult* kernel; + const XlaCompiler::CompilationResult* compilation_result; xla::LocalExecutable* executable; - std::map variables; + ResourceVarsSnapshot variables; { + std::vector variable_infos; + OP_REQUIRES_OK( + ctx, GetVariableInfosFromCtxInputs(ctx, resources_, &variable_infos)); + OP_REQUIRES_OK(ctx, LockVariables(absl::MakeSpan(variable_infos))); Status s = CompileToLocalExecutable( ctx, function_, /*has_ref_vars=*/has_ref_vars_, platform_info_, - resources_, constants_, /*lazy=*/false, &client, &variables, &kernel, - &executable); + variable_infos, constants_, /*lazy=*/false, &client, + &compilation_result, &executable); OP_REQUIRES_OK(ctx, s); + OP_REQUIRES_OK(ctx, SnapshotResourceVariables(ctx, resources_, + variable_infos, &variables)); } se::Stream* stream = @@ -373,7 +377,7 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { client, allocator, /*allocate_xla_tensors=*/platform_info_.is_on_xla_device(), platform_info_.UseMultipleStreams()); - launch_context.PopulateInputs(ctx, kernel, variables, + launch_context.PopulateInputs(ctx, compilation_result, variables, /*missing_ctx_input_prefix=*/0); // Execute the computation. @@ -413,7 +417,7 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { executable->executable()->module().input_output_alias_config(); OP_REQUIRES_OK( ctx, launch_context.PopulateOutputs( - ctx, kernel, run_result.ConsumeValueOrDie(), + ctx, compilation_result, run_result.ConsumeValueOrDie(), /*missing_ctx_input_prefix=*/0, input_output_alias, variables)); VLOG(1) << "Done"; } @@ -494,7 +498,7 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) { xla::LocalClient* client; const XlaCompiler::CompilationResult* kernel; xla::LocalExecutable* executable; - std::map variables; + ResourceVarsSnapshot variables; bool cannot_compile_cluster; { @@ -506,9 +510,16 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) { cannot_compile_cluster) { executable = nullptr; } else { + std::vector variable_infos; + OP_REQUIRES_OK( + ctx, GetVariableInfosFromCtxInputs(ctx, resources_, &variable_infos)); + OP_REQUIRES_OK(ctx, LockVariables(absl::MakeSpan(variable_infos))); Status status = CompileToLocalExecutable( - ctx, function_, has_ref_vars_, platform_info_, resources_, constants_, - /*lazy=*/!must_compile_, &client, &variables, &kernel, &executable); + ctx, function_, has_ref_vars_, platform_info_, variable_infos, + constants_, + /*lazy=*/!must_compile_, &client, &kernel, &executable); + OP_REQUIRES_OK(ctx, SnapshotResourceVariables(ctx, resources_, + variable_infos, &variables)); if (must_compile_ || status.code() != error::UNIMPLEMENTED) { OP_REQUIRES_OK(ctx, status); } diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 9f5723f4fa4..dc5df94e963 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -1837,7 +1837,7 @@ absl::flat_hash_map>* GetWhitelistTable() { "ConcatOffset", "Const", "MirrorPad", "Pack", "Pad", "PadV2", "Reverse", "ReverseV2", "ReverseSequence", "Slice", "Split", "SplitV", "StridedSlice", 
"StridedSliceGrad", "ResourceStridedSliceAssign", - "Tile", "Transpose", "InvertPermutation", "Unpack"}}}; + "Tile", "Transpose", "InvertPermutation", "Unpack", "DeviceIndex"}}}; // clang-format on return result; } diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc index e1ad0e8c5af..afaee614f02 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc @@ -28,32 +28,23 @@ limitations under the License. namespace tensorflow { -namespace { -std::map GetVariables(OpKernelContext* ctx) { - std::map variables; - for (int64 i = 0; i < ctx->num_inputs(); ++i) { +// Returns argument indices corresponding to the resource variable inputs of +// kernel context `ctx`. +static std::vector GetResourceVariableIndices(OpKernelContext* ctx) { + std::vector out; + for (int64 i = 0; i < ctx->num_inputs(); i++) { if (ctx->input(i).dtype() == DT_RESOURCE) { - core::RefCountPtr variable; - ResourceHandle handle = HandleFromInput(ctx, i); - OptionalTensor& optional = variables[i]; - optional.name = handle.name(); - if (LookupResource(ctx, handle, &variable).ok()) { - tf_shared_lock lock(*variable->mu()); - optional.present = true; - optional.value = *variable->tensor(); - } + out.push_back(i); } } - return variables; + return out; } -} // namespace Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx, const XlaDevice::Metadata& metadata, const XlaCompiler::CompilationResult* result, - xla::LocalExecutable* executable) { - std::map variables = GetVariables(ctx); - + xla::LocalExecutable* executable, + const ResourceVarsSnapshot& variable_args) { xla::LocalClient* client = metadata.client(); // Builds an XLA allocator for the device. @@ -62,7 +53,7 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx, /*allocate_xla_tensors=*/true, /*use_multiple_streams=*/metadata.UseMultipleStreams()); - launch_context.PopulateInputs(ctx, result, variables, + launch_context.PopulateInputs(ctx, result, variable_args, /*missing_ctx_input_prefix=*/0); se::Stream* stream = @@ -87,7 +78,7 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx, executable->executable()->module().input_output_alias_config(); TF_RETURN_IF_ERROR(launch_context.PopulateOutputs( ctx, result, run_result.ConsumeValueOrDie(), - /*missing_ctx_input_prefix=*/0, input_output_alias, variables)); + /*missing_ctx_input_prefix=*/0, input_output_alias, variable_args)); return Status::OK(); } @@ -115,7 +106,7 @@ Status XlaCompileOnDemandOp::ShouldArgumentBeConstant( Status XlaCompileOnDemandOp::Compile( OpKernelContext* ctx, const XlaDevice::Metadata& metadata, const XlaCompiler::CompilationResult** result, - xla::LocalExecutable** executable) { + ResourceVarsSnapshot* variable_args, xla::LocalExecutable** executable) { std::map constant_arguments; for (int64 i = 0; i < ctx->num_inputs(); ++i) { const Tensor& device_tensor = ctx->input(i); @@ -190,12 +181,18 @@ Status XlaCompileOnDemandOp::Compile( // rather than a one-element tuple. 
compile_options.always_return_tuple = false; - std::map variable_args = GetVariables(ctx); - + std::vector variables_indices = GetResourceVariableIndices(ctx); std::vector args; - - TF_RETURN_IF_ERROR(XlaComputationLaunchContext::BuildXlaCompilerArguments( - constant_arguments, variable_args, ctx, &args)); + { + std::vector variable_infos; + TF_RETURN_IF_ERROR( + GetVariableInfosFromCtxInputs(ctx, variables_indices, &variable_infos)); + TF_RETURN_IF_ERROR(LockVariables(absl::MakeSpan(variable_infos))); + TF_RETURN_IF_ERROR(SnapshotResourceVariables( + ctx, variables_indices, variable_infos, variable_args)); + TF_RETURN_IF_ERROR(XlaComputationLaunchContext::BuildXlaCompilerArguments( + constant_arguments, variable_infos, ctx, &args)); + } return cache->CompileSingleOp(options, args, ctx, compile_options, result, executable); @@ -206,8 +203,10 @@ void XlaCompileOnDemandOp::Compute(OpKernelContext* ctx) { xla::LocalExecutable* executable; const XlaDevice::Metadata* metadata; OP_REQUIRES_OK(ctx, XlaDevice::GetMetadata(ctx, &metadata)); - OP_REQUIRES_OK(ctx, Compile(ctx, *metadata, &result, &executable)); - OP_REQUIRES_OK(ctx, Run(ctx, *metadata, result, executable)); + ResourceVarsSnapshot variable_args; + OP_REQUIRES_OK(ctx, + Compile(ctx, *metadata, &result, &variable_args, &executable)); + OP_REQUIRES_OK(ctx, Run(ctx, *metadata, result, executable, variable_args)); } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.h b/tensorflow/compiler/jit/xla_compile_on_demand_op.h index 98f634db98f..cc5f2f1e42f 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.h +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.h @@ -20,6 +20,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_JIT_XLA_COMPILE_ON_DEMAND_OP_H_ #include "tensorflow/compiler/jit/xla_device.h" +#include "tensorflow/compiler/jit/xla_launch_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/core/framework/function.h" @@ -47,10 +48,12 @@ class XlaCompileOnDemandOp : public OpKernel { bool* result); Status Compile(OpKernelContext* ctx, const XlaDevice::Metadata& metadata, const XlaCompiler::CompilationResult** result, + ResourceVarsSnapshot* variable_args, xla::LocalExecutable** executable); Status Run(OpKernelContext* ctx, const XlaDevice::Metadata& metadata, const XlaCompiler::CompilationResult* result, - xla::LocalExecutable* executable); + xla::LocalExecutable* executable, + const ResourceVarsSnapshot& variable_args); }; } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index 25eed134e35..eb31b23c991 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -52,7 +52,8 @@ const char kPossibleNonVariableResourceHintMessage[] = "resource inputs to XLA."; } // anonymous namespace -VariableInfo::VariableInfo(int index, Var* var) : index_(index), var_(var) {} +VariableInfo::VariableInfo(int index, absl::string_view name, Var* var) + : index_(index), name_(name), var_(var) {} VariableInfo::VariableInfo(VariableInfo&& other) : index_(other.index_), var_(other.var_), lock_held_(other.lock_held_) { other.index_ = -1; @@ -87,16 +88,15 @@ VariableInfo::~VariableInfo() { // Returns a vector of VariableInfo instances for the resource variable inputs // to the kernel with context `ctx`. The input indices for the resource // variable inputs are in `variable_indices`. 
-static Status GetVariableInfosFromCtxInputs( - OpKernelContext* ctx, absl::Span variable_indices, - std::vector* result) { +Status GetVariableInfosFromCtxInputs(OpKernelContext* ctx, + absl::Span variable_indices, + std::vector* result) { std::vector resource_handles; absl::c_transform( variable_indices, std::back_inserter(resource_handles), [&](int variable_idx) { return &HandleFromInput(ctx, variable_idx); }); std::vector> variables; - Status s = LookupResources(ctx, resource_handles, &variables); if (!s.ok()) { errors::AppendToMessage(&s, kPossibleNonVariableResourceHintMessage); @@ -109,7 +109,9 @@ static Status GetVariableInfosFromCtxInputs( // *Release* the variable because we're going to unref it later in // ~VariableInfo. Var* variable = variables[i].release(); - result->emplace_back(variable_indices[i], variable); + int input_idx = variable_indices[i]; + std::string var_name = HandleFromInput(ctx, input_idx).name(); + result->emplace_back(input_idx, var_name, variable); } return Status::OK(); @@ -162,21 +164,12 @@ Status LockVariables(absl::Span variables) { Status SnapshotResourceVariables(OpKernelContext* ctx, absl::Span variable_indices, - std::map* result) { - std::vector variable_infos; - TF_RETURN_IF_ERROR( - GetVariableInfosFromCtxInputs(ctx, variable_indices, &variable_infos)); - TF_RETURN_IF_ERROR(LockVariables(absl::MakeSpan(variable_infos))); - + absl::Span variable_infos, + ResourceVarsSnapshot* result) { for (int i = 0; i < variable_indices.size(); i++) { - if (variable_infos[i].var()) { - OptionalTensor& tensor = (*result)[variable_indices[i]]; - tensor.name = HandleFromInput(ctx, variable_indices[i]).name(); - tensor.present = true; - tensor.value = *variable_infos[i].var()->tensor(); - } else { - (*result)[variable_indices[i]] = OptionalTensor(); - } + Var* var = variable_infos[i].var(); + (*result)[variable_indices[i]] = + var ? absl::make_optional(*var->tensor()) : absl::nullopt; } return Status::OK(); } @@ -197,8 +190,7 @@ XlaComputationLaunchContext::XlaComputationLaunchContext( void XlaComputationLaunchContext::PopulateInputs( OpKernelContext* ctx, const XlaCompiler::CompilationResult* compilation_result, - const std::map& variables, - int missing_ctx_input_prefix) { + const ResourceVarsSnapshot& variables, int missing_ctx_input_prefix) { // Build ShapedBuffers that point directly to the Tensor buffers. arg_ptrs_ = std::vector(compilation_result->xla_input_shapes.size()); @@ -210,7 +202,7 @@ void XlaComputationLaunchContext::PopulateInputs( CHECK_GE(arg_num, missing_ctx_input_prefix); const xla::Shape& shape = compilation_result->xla_input_shapes[i]; const Tensor* t = variables.count(arg_num) - ? &(variables.at(arg_num).value) + ? &(variables.at(arg_num).value()) : &(ctx->input(arg_num - missing_ctx_input_prefix)); CHECK(t); @@ -262,7 +254,7 @@ static const Tensor* FindAliasedTensorForOutput( int output_num, OpKernelContext* ctx, int missing_ctx_input_prefix, const xla::HloInputOutputAliasConfig& input_output_alias, absl::Span input_mapping, - const std::map& resource_var_snapshots) { + const ResourceVarsSnapshot& resource_var_snapshots) { if (MustAliasOutput(input_output_alias, output_num)) { int xla_param = input_output_alias.GetAliasedParameter({output_num}) .value() @@ -274,8 +266,8 @@ static const Tensor* FindAliasedTensorForOutput( // entry time. 
if (input_tensor->dtype() == DT_RESOURCE) { auto& v = resource_var_snapshots.at(missing_ctx_input_prefix + tf_param); - CHECK(v.present); - return &v.value; + CHECK(v.has_value()); + return &v.value(); } return input_tensor; } @@ -298,9 +290,9 @@ static Tensor GetOrCreateTensorForOutput( int output_num, OpKernelContext* ctx, int missing_ctx_input_prefix, const xla::HloInputOutputAliasConfig& input_output_alias, absl::Span input_mapping, - const std::map& resource_var_snapshots, - DataType output_dtype, const TensorShape& output_shape, - se::DeviceMemoryBase output_buffer, Allocator* output_allocator) { + const ResourceVarsSnapshot& resource_var_snapshots, DataType output_dtype, + const TensorShape& output_shape, se::DeviceMemoryBase output_buffer, + Allocator* output_allocator) { if (const Tensor* aliased_tensor = FindAliasedTensorForOutput( output_num, ctx, missing_ctx_input_prefix, input_output_alias, input_mapping, resource_var_snapshots)) { @@ -431,13 +423,13 @@ static xla::StatusOr> GatherVariableInfo( // TODO(b/35625933): tensorflow::Var should contain a PersistentTensor, // not a Tensor. Var* variable = nullptr; - TF_RETURN_IF_ERROR(LookupOrCreateResource( - ctx, HandleFromInput(ctx, actual_input_index), &variable, - [&write](Var** ptr) { - *ptr = new Var(write.type); - return Status::OK(); - })); - variable_infos.emplace_back(actual_input_index, variable); + const ResourceHandle handle = HandleFromInput(ctx, actual_input_index); + TF_RETURN_IF_ERROR(LookupOrCreateResource(ctx, handle, &variable, + [&write](Var** ptr) { + *ptr = new Var(write.type); + return Status::OK(); + })); + variable_infos.emplace_back(actual_input_index, handle.name(), variable); } return variable_infos; } @@ -447,7 +439,7 @@ Status XlaComputationLaunchContext::PopulateOutputs( const XlaCompiler::CompilationResult* compilation_result, ScopedShapedBuffer output, int missing_ctx_input_prefix, const xla::HloInputOutputAliasConfig& input_output_alias, - const std::map& resource_var_snapshots) { + const ResourceVarsSnapshot& resource_var_snapshots) { se::Stream* stream = ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr; Allocator* allocator = ctx->device()->GetAllocator({}); @@ -484,10 +476,36 @@ Status XlaComputationLaunchContext::PopulateOutputs( stream->ThenRecordEvent(definition_event.get()); } + std::vector output_tensor_shapes; + output_tensor_shapes.reserve(ctx->num_outputs()); + if (output.on_host_shape().is_dynamic()) { + TF_ASSIGN_OR_RETURN( + auto transfer_manager, + xla::TransferManager::GetForPlatform(stream->parent()->platform())); + + xla::Shape output_host_shape = output.on_host_shape(); + xla::Shape output_device_shape = output.on_device_shape(); + TF_RETURN_IF_ERROR(transfer_manager->ReadDynamicShapes( + stream, &output, &output_host_shape, &output_device_shape)); + + output.set_shapes(output_host_shape, output_device_shape); + for (int i = 0; i < ctx->num_outputs(); ++i) { + const xla::Shape& subshape = + xla::ShapeUtil::GetSubshape(output_host_shape, {i}); + TensorShape shape; + TF_RETURN_IF_ERROR(XLAShapeToTensorShape(subshape, &shape)); + output_tensor_shapes.push_back(shape); + } + } else { + for (int i = 0; i < ctx->num_outputs(); ++i) { + output_tensor_shapes.push_back(compilation_result->outputs[i].shape); + } + } + // Copy XLA results to the OpOutputList. 
int output_num = 0; for (int i = 0; i < ctx->num_outputs(); ++i) { - const TensorShape& shape = compilation_result->outputs[i].shape; + const TensorShape& shape = output_tensor_shapes[i]; const DataType& type = compilation_result->outputs[i].type; VLOG(2) << "Retval " << i << " shape " << shape.DebugString() << " type " << DataTypeString(type); @@ -564,12 +582,21 @@ Status XlaComputationLaunchContext::PopulateOutputs( Status XlaComputationLaunchContext::BuildXlaCompilerArguments( const std::map& constant_args, - const std::map& variable_args, OpKernelContext* ctx, + absl::Span variable_args, OpKernelContext* ctx, std::vector* args) { args->resize(ctx->num_inputs()); + absl::flat_hash_map variable_info_lookup; + for (const VariableInfo& info : variable_args) { + CHECK(!info.var() || info.lock_held()) + << "Need to hold the lock on resource variables " + "before calling BuildXlaCompilerArguments"; + variable_info_lookup.emplace(info.index(), &info); + } + for (int64 input_num = 0; input_num < ctx->num_inputs(); ++input_num) { XlaCompiler::Argument& arg = (*args)[input_num]; + if (constant_args.count(input_num) > 0) { // Handles compile-time constants. const Tensor& input = constant_args.at(input_num); @@ -578,7 +605,7 @@ Status XlaComputationLaunchContext::BuildXlaCompilerArguments( arg.type = input.dtype(); arg.shape = input.shape(); arg.constant_value = input; - } else if (variable_args.count(input_num) == 0) { + } else if (variable_info_lookup.count(input_num) == 0) { // Handles the non-constant arguments. const Tensor& input = ctx->input(input_num); TF_RET_CHECK(input.dtype() != DT_RESOURCE); @@ -594,14 +621,14 @@ Status XlaComputationLaunchContext::BuildXlaCompilerArguments( // Handles resource variables. const Tensor& input = ctx->input(input_num); TF_RET_CHECK(input.dtype() == DT_RESOURCE); - const OptionalTensor& variable = variable_args.at(input_num); - arg.name = variable.name; + const VariableInfo& variable = *variable_info_lookup[input_num]; + arg.name = std::string(variable.name()); arg.kind = XlaCompiler::Argument::kResource; arg.resource_kind = XlaResource::kVariable; - if (variable.present) { - const Tensor& value = variable.value; - arg.type = value.dtype(); - arg.shape = value.shape(); + if (variable.var()) { + const Tensor* value = variable.var()->tensor(); + arg.type = value->dtype(); + arg.shape = value->shape(); arg.initialized = true; } else { // The values of uninitialized variables are not passed as inputs, since diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index 9a7f20cb310..92b6c4c8a08 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -34,36 +34,17 @@ limitations under the License. namespace tensorflow { -// Struct that represents a possibly-absent Tensor. -struct OptionalTensor { - string name; // A descriptive name - bool present = false; // Is the tensor present? - Tensor value; // If present, what is the Tensor's value? -}; - -// Takes a snapshot of the values of resource variable arguments, whose indices -// are specified in `variable_indices` argument. We snapshot tensors that back -// resource variables since concurrent updates may modify the shape, and it is -// important that the shapes used for compilation match the true shapes of the -// buffers. -// -// We snapshot the entire set of resource variables as one atomic operation. -// This models Read->* dependencies between resource variable operations. 
See -// jit/resource_operation_safety_analysis for details. -// -// Returns a map of TensorFlow argument index to resource variable. If a -// resource variable is not initialized, the corresponding OptionalTensor -// will have its `present` field set to false. -Status SnapshotResourceVariables(OpKernelContext* ctx, - absl::Span variable_indices, - std::map* result); +// Snapshot of resource variables for a TF kernel invocation, mapping from +// parameter number to values at execution time. If the resource variable is not +// initialized, the value will not be present. +using ResourceVarsSnapshot = absl::flat_hash_map>; // Information about the state of a variable passed as input to the _XlaCompile // and _XlaRun operators. Unlocks the resource variable and decrements its // refcount on destruction. class VariableInfo { public: - explicit VariableInfo(int index, Var* var); + explicit VariableInfo(int index, absl::string_view name, Var* var); VariableInfo(VariableInfo&& other); VariableInfo& operator=(VariableInfo&& other); @@ -79,6 +60,9 @@ class VariableInfo { // "empty", i.e. it does not track a resource variable. Var* var() const { return var_; } + // Returns the variable name. + absl::string_view name() const { return name_; } + // Returns true if the resource variable lock was successfully acquired by // this thread. bool lock_held() const { return lock_held_; } @@ -88,6 +72,7 @@ class VariableInfo { private: int index_; + std::string name_; Var* var_; // We can't use a optional here because it confuses the compiler's @@ -96,6 +81,20 @@ class VariableInfo { bool lock_held_ = false; }; +// Takes a snapshot of the values of resource variable arguments, whose indices +// are specified in `variable_indices` argument. We snapshot tensors that back +// resource variables since concurrent updates may modify the shape, and it is +// important that the shapes used for compilation match the true shapes of the +// buffers. +// +// We snapshot the entire set of resource variables as one atomic operation. +// This models Read->* dependencies between resource variable operations. See +// jit/resource_operation_safety_analysis for details. +Status SnapshotResourceVariables(OpKernelContext* ctx, + absl::Span variable_indices, + absl::Span variable_infos, + ResourceVarsSnapshot* result); + // Acquires the mutexes for all the variables in `variables` using a // deadlock-safe protocol (acquire the mutexes in increasing-address order). // @@ -104,6 +103,13 @@ class VariableInfo { Status LockVariables(absl::Span variables) TF_EXCLUSIVE_LOCK_FUNCTION(); +// Returns a vector of VariableInfo instances for the resource variable inputs +// to the kernel with context `ctx`. The input indices for the resource +// variable inputs are in `variable_indices`. +Status GetVariableInfosFromCtxInputs(OpKernelContext* ctx, + absl::Span variable_indices, + std::vector* result); + // Helper class to perform the marshalling of TensorFlow inputs and outputs to // ShapedBuffers suitable for passing to an XLA computation. class XlaComputationLaunchContext { @@ -123,9 +129,10 @@ class XlaComputationLaunchContext { // Builds a XlaCompiler::Argument vector from the arguments to an XlaLaunch // op. + // Precondition: variables in `variable_args` are locked. static Status BuildXlaCompilerArguments( const std::map& constant_args, - const std::map& variable_args, OpKernelContext* ctx, + absl::Span variable_args, OpKernelContext* ctx, std::vector* args); // Add all inputs within `ctx` as XLA arguments (returned by arguments()). 
@@ -137,7 +144,7 @@ class XlaComputationLaunchContext { // (in other words, no inputs actually required by the kernel can be missing). void PopulateInputs(OpKernelContext* ctx, const XlaCompiler::CompilationResult* compilation_result, - const std::map& variables, + const ResourceVarsSnapshot& variables, int missing_ctx_input_prefix); // Given the XLA output in `output`, populate all outputs of `ctx`. Also @@ -155,7 +162,7 @@ class XlaComputationLaunchContext { const XlaCompiler::CompilationResult* compilation_result, xla::ScopedShapedBuffer output, int missing_ctx_input_prefix, const xla::HloInputOutputAliasConfig& input_output_alias, - const std::map& resource_var_snapshots); + const ResourceVarsSnapshot& resource_var_snapshots); // Return the argument list. Only valid after PopulateInputs() has been // called. diff --git a/tensorflow/compiler/mlir/g3doc/xla_gpu_codegen.md b/tensorflow/compiler/mlir/g3doc/xla_gpu_codegen.md new file mode 100644 index 00000000000..06c55abf1fa --- /dev/null +++ b/tensorflow/compiler/mlir/g3doc/xla_gpu_codegen.md @@ -0,0 +1,265 @@ +# MLIR CodeGen for XLA + + + +XLA operates on `HloInstruction` and performs many optimizations on this +representation, sharing a lot of these between targeted devices. At some point a +linear schedule is computed and the memory buffer is assigned to each value +statically. The device-specific codegen operates by traversing this sequence and +calling "emitters" to generate a representation suitable for the device (for +example a single LLVM function per XLA computation on CPU, or a sequence of +"thunks" encapsulating GPU operations and possibly generated PTX when targeting +GPU). + +As a staging step, we're currently in the process of intercepting the process +right after XLA completes the buffer-assignment phase and emitting instead an MLIR +module in the `lhlo` dialect. From there we perform the codegen using MLIR +components (Linalg, affine, and GPU dialect mainly) depending on the device. + +Below is the plan of record to incrementally migrate XLA/GPU by using `lhlo` as +the codegen input. + +## Tasks + + | Host | Device ------------- | ------------------------ | ------------------------ Input format | HloInstruction* (Task 1) | HloInstruction* (Task 1) Output format | xla::Thunk (Task 2) | LLVM IR (Task 3) + +* **Task 1** changes both host and device input format from HloInstruction* to + LHLO. +* **Task 2** changes the output format of the host from thunks to "some landing pad + for host" (see below). +* **Task 3** migrates device output from LLVM IR to some form of MLIR. It's + optional for this project; see the section "Migrating Device LLVM IR" for + details. + +This project prioritizes having end-to-end runnable models with LHLO-emitters +enabled as much as possible. This implies the following list of +objectives, ordered by priority: + +* Make XLA/GPU runnable with LHLO emitters, with existing Thunks and emitters + unmodified. +* Eliminate the references to HloInstruction\* in LHLO, case by case: + * Switch a legacy emitter to an MLIR-based emitter (e.g. Linalg), or + * Mechanically translate the existing emitter to take MLIR representation + (migrate to Standard with GPU Dialect). + +## Migrating Thunks (Task 2) + +xla::gpu::Thunk is a data structure that: + +* Can be called into from the host (xla::gpu::Thunk::ExecuteOnStream()). +* Carries various data in its subclasses. +* Interacts with BufferAllocation::Slice and StreamExecutor. +* Launches kernels. +* Calls into all runtime libraries.
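To make the host-side contract in the list above concrete, here is a minimal, self-contained C++ sketch of a Thunk-style interface. It is illustrative only: `Status`, `Stream`, and `BufferAllocations` are stand-ins for the real XLA/StreamExecutor types, and the actual `xla::gpu::Thunk` carries considerably more state (thunk kinds, launch dimensions, buffer slices, library configs).

```cpp
// A minimal sketch of a Thunk-style host interface; not the real xla::gpu::Thunk.
#include <memory>
#include <vector>

struct Status { bool ok = true; };   // stand-in for tensorflow::Status
struct Stream {};                    // stand-in for se::Stream
struct BufferAllocations {};         // stand-in for BufferAllocation::Slice lookups

class Thunk {
 public:
  virtual ~Thunk() = default;
  // Host-side entry point: enqueue a kernel launch, a library call (cuDNN,
  // cuBLAS, cuFFT, ...) or nested control flow onto `stream`, reading and
  // writing buffers resolved through `buffers`.
  virtual Status ExecuteOnStream(const BufferAllocations& buffers,
                                 Stream* stream) = 0;
};

// A compiled XLA/GPU program is currently a sequence of thunks run in
// schedule order.
using ThunkSequence = std::vector<std::unique_ptr<Thunk>>;
```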
+ +The cost of that includes: + +* Representing op-specific configuration data (e.g. convolution configs). +* Migrating op shape and operand shapes. +* Representing a tree of thunks (while, condition, etc.). + +The migration work is independent of the LHLO / emitter migration. Under limited +resources, it's prioritized behind the LHLO / emitter migration. + +We have several choices on how to lower the host-side part from LHLO: + +* TFRT + * (Pro) great CUDA and HIP wrappers for use. + * (Pro) easy to implement library calls (cuDNN, cuBLAS, cuFFT, etc.), as + TFRT ops are interpreted by C++ code. + * (Con) the host side is under development and not tested. + * (Con) the JAX integration isn’t clear from a runtime point of view. +* Jitted CPU code + * (Pro) lowers easily: create a few loops and conditions and it's + done. + * (Con) GPUDialect doesn't yet model chains/streams/asynchronicity/device + allocation. + * (Con) CUDA / HIP runtime support is minimal (toolkit path, version, + dynamic loading, etc.). +* Existing (interpreting) XLA runtime. + +Tentative conclusion: use jitted CPU code during the transition, and optionally +adopt TFRT in the end. + +## Migrating Device LLVM IR (Task 3) + +An elemental emitter generates the target op by filling it in element by element +(a small sketch of this idea follows at the end of this section). Each +output element depends on a set of elements from the operands. All elements are +described by combining the buffer with dynamic indices. It's sufficient to +describe almost all "math" ops, but for performance reasons only a large subset +of "math" ops are implemented directly in (Cpu|Gpu)ElementalIrEmitter. + +ElementalIrEmitter is unique in that: + +* A large portion of the code is shared between XLA/GPU and CPU. +* It represents a large portion of ops seen in models, including all + element-wise ops. +* Most fusions solely depend on ElementalIrEmitter. +* It's structurally simple, as it describes a data dependency DAG between op + elements and operand elements. +* It's mostly portable and high-level (e.g. unlike GPU kReduce and GPU kCopy). +* Dynamic shape support is easy for at least element-wise ops. + +Now, for all ops, elementally-emitted or not, there are several flavors of the +end state of each XLA op: + +1. Device code stays as LLVM IR. +1. Refactor the old emitter to be like LHLO -> MLIR LLVM Dialect: + * (Cost) Will be throw-away work if we want to ultimately migrate to + Standard. + * (Cost) It doesn't provide additional benefit compared to option 1. + * (Benefit) It is easy and mechanical. Can be done in a short period. +1. Refactor old emitters to be like LHLO -> MLIR GPU + Standard + Loops: + * (Cost) Lifting existing emitters to Standard introduces some challenges. + Pointers and GEPs need to be converted to MemRefs and SubViews. Ensuring + AMDGPU completeness is another challenge. + * (Cost) XLA/GPU heavily relies on LLVM metadata: + * `range` for block/thread indices. + * `align`, `dereferenceable`, `invariant.load`, `alias.scope`, + `noalias` for load/stores. + * `llvm.loop.unroll.disable`, `llvm.loop.unroll.full`, + `llvm.loop.vectorize.enable` for sequential loops. + * (Benefit) Can be long-term. More portable. +1. Refactor old emitters to be LHLO -> Linalg, and write new Linalg emitters: + * (Cost) This is case by case. Compared to previous options, a new + implementation that matches XLA's performance needs to go through the + benchmark <-> optimize workflow, which can be a significant cost for + some ops. + * (Benefit) Unified stack; community support; portability; more + optimization potential.
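As a rough mental model of the elemental emission described above (a sketch only; not the actual `(Cpu|Gpu)ElementalIrEmitter` API, and the names here are invented), each op contributes a per-element generator, and fusion is just generator composition; the surrounding loop over output indices is emitted separately per target:

```c++
#include <cstdint>
#include <functional>

// A generator maps an output index to the value of that element.
using ElementGenerator = std::function<float(int64_t)>;

// Elementwise add: out[i] = lhs[i] + rhs[i].
ElementGenerator MakeAddGenerator(ElementGenerator lhs, ElementGenerator rhs) {
  return [lhs, rhs](int64_t i) { return lhs(i) + rhs(i); };
}

// A fused multiply-add is simply composed generators; no intermediate buffer
// is materialized, which is why most fusions only need the elemental emitter.
ElementGenerator MakeMulAddGenerator(ElementGenerator a, ElementGenerator b,
                                     ElementGenerator c) {
  return [a, b, c](int64_t i) { return a(i) * b(i) + c(i); };
}
```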
+ +## Prioritization + +While all three tasks mentioned above are parallelizable, under limited +resources they have to be serialized. The prioritization focuses on visible +results for the completion of each task. + +The prioritization is: Task 1 (LHLO for legacy emitters) > Task 2 (Thunks) > Task +3 (MLIR emitters). + +By the end of Task 1, users of XLA (e.g. the kernel generator) can generate an LHLO +and execute it. The compilation format will not be serializable MLIR. + +By the end of Task 2, LHLO lowers to proper, serializable MLIR. This enables +offline compilation. + +By the end of Task 3, all XLA emitters are MLIR-based in their implementation. + +## Detailed Design + +### Step 1: (Task 1) Complete LHLO and Make Legacy Emitters Take LHLO + +This step makes all existing XLA/GPU emitters interact with MLIR ops. This step +is pure refactoring and NFC. + +This step is mostly mechanical, but it's worth noticing the following +discrepancies between an unnested HloComputation and LHLO: + +* Each HloInstruction has direct access to its operands (a data-flow DAG). By + contrast, each LHLO op only has access to its operand buffers (a bipartite + graph between ops and buffers). LHLO ops have to go through use-def chains to + access their operand ops. +* Unnested legacy emitters empirically almost never access their operands. The + only exception is kReduce. +* Unnested legacy emitters access BufferAssignment only for getting slices, + not for accessing aux data structures like dataflow\_analysis() or + alias\_analysis(). llvm\_ir builds its own alias\_analysis() based on slice + information. + +The conclusion is that LHLO should fit right in without major hassle. + +### Step 2: (Optional) Profiling Support + +**This step is only needed if we start to discard some of the XLA Thunk logic +(see the next step).** + +Before actually turning on any MLIR-based emitters, we need profiling for +MLIR-based emitters. + +Currently XLA performs its own profiling by calling into StreamExecutor's timer. +The timer under the hood inserts two events before and after a kernel launch, +and measures the sync time between these two events. + +There are roughly two approaches to support profiling in MLIR: + +* Run a profiler end-to-end. +* Add a profile op for each op in LHLO, using an injected profiler. + +The "end-to-end" approach is transparent to MLIR, but suffers the same problem +that makes XLA not use it in the first place: library calls collected by a +profiler (nvprof/...) can't easily relate to HLO ops. For example, cuDNN +launches multiple kernels for each HLO, and it's hard to tell which kernels +correspond to which HLO. + +The "injected profiler" approach requires: + +* LHLO to take a profiler as a parameter. +* Inserting profile.start / profile.end before and after each op. +* A pass that lowers profile.{start,end} to a C++ implementation. + +Exact profiling can't easily be done for MLIR-generated ops, since: + +* MLIR doesn't have a timer, nor does it depend on TFRT / StreamExecutor. +* MLIR doesn't easily call into C functions with complicated parameters. + +### Step 3: (Task 2) Migrating Thunks + +This step migrates all host ops and library calls. This step will eliminate most +of the thunks and produce serializable MLIR instead. + +There are roughly three kinds of thunks: + +* KernelThunk, which launches a kernel. +* Control flow thunks, which carry host control flow logic (conditional, while, + for, sequence) and launch body kernels (see the sketch after this list). +* Library thunks: cuDNN, cuBLAS, cuFFT, NCCL, etc.
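As a hedged illustration of the second kind (control-flow thunks), the sketch below shows how a while-style thunk could drive device kernels from host C++; the types and names are hypothetical and independent of the real xla::gpu classes.

```c++
#include <functional>
#include <memory>
#include <utility>

struct StreamHandle;  // stands in for a device stream

// Minimal host-callable interface, mirroring the Thunk sketch given earlier.
struct HostThunk {
  virtual ~HostThunk() = default;
  virtual void ExecuteOnStream(StreamHandle* stream) = 0;
};

// A while-style control-flow thunk: the loop itself runs on the host and
// repeatedly re-launches the condition and body kernels on the device.
class WhileThunkSketch : public HostThunk {
 public:
  WhileThunkSketch(std::unique_ptr<HostThunk> cond, std::unique_ptr<HostThunk> body,
                   std::function<bool(StreamHandle*)> read_predicate)
      : cond_(std::move(cond)),
        body_(std::move(body)),
        read_predicate_(std::move(read_predicate)) {}

  void ExecuteOnStream(StreamHandle* stream) override {
    for (;;) {
      cond_->ExecuteOnStream(stream);       // compute the loop predicate on device
      if (!read_predicate_(stream)) break;  // copy the predicate back to the host
      body_->ExecuteOnStream(stream);       // re-launch the body kernels
    }
  }

 private:
  std::unique_ptr<HostThunk> cond_;
  std::unique_ptr<HostThunk> body_;
  std::function<bool(StreamHandle*)> read_predicate_;
};
```

Lowering this kind of host logic to serializable MLIR (or jitted CPU code) instead of C++ is exactly the "bottom line" choice discussed next.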
+ +The **bottom line** is to: + +* Create a Thunk dialect that provides (de)serialization logic for all existing + C++-based Thunks. +* Change emitters to emit a graph of the Thunk dialect. + +**Optionally**, we can relieve some thunks of their C++ implementation. KernelThunk +can lower to the GPU LaunchKernelOp. Control flow thunks can leverage the CFG +Dialect for loops and conditions, combined with LaunchKernelOp. This optional +step requires profiling and stream support. + +### Step 4: (Task 3) Migrated ElementalIrEmitter + +Once profiling is ready, we can complete and tune all ElementalIrEmitter-based +emitters in MLIR. Then we turn them on by default, assuming that all of these +MLIR-based emitters use a single stream. + +Notice that it's beneficial to migrate XLA/CPU's ElementalIrEmitter as well, +since they share a large portion of the code. + +With all benchmarking and performance hunting done (TODO: define performance +parity), we turn on the new MLIR-based elemental emitter, and delete the legacy +ElementalIrEmitter. + +This step also provides easy fusion transitions (nested ops) for the later +migration. + +### Step 5: Multi-Stream Support or Drop + +We can't delete +[some of the emitters](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/service/gpu/stream_assignment.cc#L140) +until we support multi-stream in MLIR, or we drop the feature. It's a relatively large +amount of work in MLIR and a small amount of gain for XLA. We should investigate +the current users of multi-stream XLA/GPU, and try to delete this feature if +reasonable. + +### Step 6: (Task 3) Migrated Device Ops + +This step migrates all unnested ops, then we can delete all unnested emitters. + +This calls for a rewrite/refactor of kCopy and kReduce. kReduce has already +received plenty of work, so the actual amount of remaining work remains +to be seen. diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 8e9d615053c..8d4efeb3d60 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -314,7 +314,6 @@ tf_cc_test( cc_library( name = "tensorflow_lite_legalize_tf", srcs = [ - "transforms/device_index_selector.cc", "transforms/dilated_conv.cc", "transforms/generated_legalize_tf.inc", "transforms/generated_lower_static_tensor_list.inc", diff --git a/tensorflow/compiler/mlir/lite/converter_gen.cc b/tensorflow/compiler/mlir/lite/converter_gen.cc index 6df569a8031..edead2037a3 100644 --- a/tensorflow/compiler/mlir/lite/converter_gen.cc +++ b/tensorflow/compiler/mlir/lite/converter_gen.cc @@ -446,7 +446,7 @@ static void GenOperandResultVerifier(raw_ostream &os, auto desc = definit->getDef()->getValueAsString("tflRuntimeTypeDescription"); - // Emit a loop to check all the dynamic values in the pack. + // Emit a loop to check all operands.
os << formatv(" for (Value v : top.getODS{0}{1}s({2})) {{\n", // Capitalize the first letter to match the function name valueKind.substr(0, 1).upper(), valueKind.substr(1), @@ -455,14 +455,10 @@ static void GenOperandResultVerifier(raw_ostream &os, os << " (void)v;\n" << " if (!(" << tgfmt(pred.getCondition(), &fctx.withSelf("v.getType()")) << ")) {\n" - << " if (failure_on_operand_type_mismatch) {\n" << formatv( - " return op->emitOpError(\"{0} #\") << index " + " return op->emitOpError(\"{0} #\") << index " "<< \" must be {1}, but got \" << v.getType();\n", valueKind, desc) - << " } else {\n" - << " return ::mlir::LogicalResult::Failure;\n" - << " }\n" << " }\n" // if << " ++index;\n" << " }\n"; // for @@ -487,8 +483,7 @@ static bool RuntimeVerifierWriterMain(raw_ostream &os, RecordKeeper &records) { mlir::tblgen::FmtContext verify_ctx; os << "::mlir::LogicalResult " << op.getCppClassName() - << "::VerifyTflRuntimeConstraints(::mlir::Operation *op, bool " - "failure_on_operand_type_mismatch) {\n"; + << "::VerifyTflRuntimeConstraints(::mlir::Operation *op) {\n"; os << " auto top = cast<" << op.getCppClassName() << ">(op); (void)top;\n"; verify_ctx.withOp("top"); @@ -529,11 +524,8 @@ static bool RuntimeVerifierWriterMain(raw_ostream &os, RecordKeeper &records) { mlir::tblgen::Pred pred(dyn_cast(val->getValue())); os << tgfmt( - " if (!($0)) {\n " - " if (failure_on_operand_type_mismatch) {\n" - " return top.emitOpError(\"failed to verify that $1\");\n" - " } else {\n" - " return ::mlir::LogicalResult::Failure;\n }\n }\n", + " if (!($0))\n" + " return top.emitOpError(\"failed to verify that $1\");\n", &verify_ctx, tgfmt(pred.getCondition(), &verify_ctx), desc); } os << " return top.verify();\n}\n"; diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc index a260670015a..e34e7ae7ca6 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc @@ -240,10 +240,10 @@ static bool IsValidTFLiteMlirModule(ModuleOp module) { } for (auto fn : module.getOps()) { - if (fn.getBlocks().size() != 1) { + if (!llvm::hasSingleElement(fn)) { return fn.emitError("should have exactly one basic block"), false; } - auto& bb = fn.getBlocks().front(); + auto& bb = fn.front(); for (auto arg : bb.getArguments()) { if (!HasValidTFLiteType(arg, fn)) @@ -1089,7 +1089,7 @@ void Translator::InitializeNamesFromAttribute(FuncOp fn, bool* has_input_attr) { dict_attr.get("outputs").dyn_cast_or_null()) { str.getValue().split(output_names, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false); - auto term = fn.getBlocks().back().getTerminator(); + auto term = fn.back().getTerminator(); if (output_names.size() != term->getNumOperands()) { fn.emitWarning() << "output names (" << output_names.size() << ") != terminator operands (" << term->getNumOperands() diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td b/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td index 23101113a6f..a79d79b5970 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td @@ -94,8 +94,7 @@ def TFL_RuntimeVerification : OpInterface<"TflRuntimeVerifyOpInterface"> { let methods = [ StaticInterfaceMethod< [{Returns whether the op's operands/results are supported by runtime.}], - "LogicalResult", "VerifyTflRuntimeConstraints", - (ins "Operation*":$op, "bool":$failure_on_operand_type_mismatch) + "LogicalResult", "VerifyTflRuntimeConstraints", (ins "Operation*":$op) 
>, ]; } diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index 6e9930271c8..16d256c7571 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -758,6 +758,22 @@ OpFoldResult ConcatenationOp::fold(ArrayRef operands) { return new_concat.getResult(); } +//===----------------------------------------------------------------------===// +// CustomOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(CustomOp op) { + OpaqueElementsAttr opaque_attr = + op.custom_option().cast(); + if (!opaque_attr.getType().hasStaticShape()) + return op.emitOpError("custom_option should have a static shape."); + if (opaque_attr.getValue().size() != + opaque_attr.getType().cast().getDimSize(0)) + return op.emitOpError( + "custom_option should have the same length of content with shape."); + return success(); +} + //===----------------------------------------------------------------------===// // FullyConnectedOp //===----------------------------------------------------------------------===// @@ -2169,6 +2185,10 @@ static LogicalResult Verify(TransposeOp op) { return success(); } +//===----------------------------------------------------------------------===// +// WhileOp +//===----------------------------------------------------------------------===// + LogicalResult Verify(WhileOp op) { if (op.getNumOperands() != op.getNumResults()) return op.emitOpError(llvm::formatv( @@ -2178,18 +2198,6 @@ LogicalResult Verify(WhileOp op) { return success(); } -static LogicalResult Verify(CustomOp op) { - OpaqueElementsAttr opaque_attr = - op.custom_option().cast(); - if (!opaque_attr.getType().hasStaticShape()) - return op.emitOpError("custom_option should have a static shape."); - if (opaque_attr.getValue().size() != - opaque_attr.getType().cast().getDimSize(0)) - return op.emitOpError( - "custom_option should have the same length of content with shape."); - return success(); -} - namespace { // Canonicalize While op so that results and operands match and external values // are via implicit capture rather than via block args. diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 509c13ae161..f379b241f9d 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -571,6 +571,8 @@ def TFL_TransposeConvOp: TFL_Op<"transpose_conv", [ TFL_OperandHasRank<2, 4>, PredOpTrait<"input and output must have same element type", TFL_TCresVTEtIsSameAsOp<0, 2>>, + AccumulatorUniformScale<3, 1, 2>, + TFL_ChannelDimIndexInterface, AffineOpCoefficient<0, 2>, TFL_GpuTargetOp, TFL_SparseOp]> { let summary = "Transpose convolution operator"; @@ -596,6 +598,8 @@ def TFL_TransposeConvOp: TFL_Op<"transpose_conv", [ let verifier = [{ return Verify(*this); }]; let extraClassDeclaration = [{ + // ChannelDimIndexInterface: + int GetChannelDimIndex() { return 0; } // SparseOpInterface: std::vector GetSparseOperands() { return {1}; } std::vector> GetFloatBlockSize() { return {}; } @@ -953,14 +957,14 @@ in the batch dimensions and broadcasting. 
}]; let arguments = (ins - TFL_TensorOf<[F32]>:$x, - TFL_TensorOf<[F32]>:$y, + TFL_TensorOf<[F32, QI8]>:$x, + TFL_TensorOf<[F32, QI8]>:$y, DefaultValuedAttr:$adj_x, DefaultValuedAttr:$adj_y ); let results = (outs - TFL_TensorOf<[F32]>:$output + TFL_TensorOf<[F32, QI8]>:$output ); let hasOptions = 1; diff --git a/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc b/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc index d924a3e82ac..6299a70b1df 100644 --- a/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc +++ b/tensorflow/compiler/mlir/lite/quantization/import_quant_stats_pass.cc @@ -76,7 +76,8 @@ class ImportQuantStatsPass // If the index is out of range, this method returns false. Otherwise it // returns true if the value is a float tensor. bool IsQuantizableResult(Operation *op, int index) { - if (index < 0 || index >= op->getNumResults()) return false; + if (index < 0 || index >= static_cast(op->getNumResults())) + return false; Value res = op->getResult(index); return res.getType().isa() && res.getType().cast().getElementType().isa(); @@ -158,7 +159,7 @@ void ImportQuantStatsPass::ImportAsStatsOps(OpBuilder b, Operation *op, InsertStatsOpAtResult(b, op->getResult(index), layer_stats, axis_stats, axis); } else { - for (int i = 0; i < op->getNumResults(); ++i) { + for (int i = 0, e = op->getNumResults(); i < e; ++i) { if (IsQuantizableResult(op, i)) { InsertStatsOpAtResult(b, op->getResult(i), layer_stats, axis_stats, axis); diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc index 3edd9c36760..9adabde4f25 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc @@ -48,7 +48,7 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, std::vector> node_mins; if (!min_values.empty()) { std::vector node_mins_str = absl::StrSplit(min_values, ','); - for (int i = 0; i < node_mins_str.size(); i++) { + for (int i = 0, e = node_mins_str.size(); i < e; i++) { double value; if (!absl::SimpleAtod(node_mins_str[i], &value)) { return true; @@ -60,7 +60,7 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, std::vector> node_maxs; if (!max_values.empty()) { std::vector node_maxs_str = absl::StrSplit(max_values, ','); - for (int i = 0; i < node_maxs_str.size(); i++) { + for (int i = 0, e = node_maxs_str.size(); i < e; i++) { double value; if (!absl::SimpleAtod(node_maxs_str[i], &value)) { llvm::errs() << "Unexpected mins: " << node_maxs_str[i] << "\n"; diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc index 89443b1ec65..f3e746c7a43 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc @@ -294,7 +294,7 @@ class QuantizationDriver { return; if (current_op == op) llvm::errs() << "===>>>"; llvm::errs() << op->getName() << " : ("; - for (auto i = 0; i < op->getNumOperands(); ++i) { + for (int i = 0, e = op->getNumOperands(); i < e; ++i) { if (auto params = GetOperandQuantState(op, i).params) params.print(llvm::errs()); else @@ -303,7 +303,7 @@ class QuantizationDriver { llvm::errs() << ","; } llvm::errs() << ") -> ("; - for (auto i = 0; i < op->getNumResults(); ++i) { + for (int i = 0, e = op->getNumResults(); i < e; ++i) { if (auto params = 
GetResultQuantState(op, i).params) params.print(llvm::errs()); else diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc index 32f68aaae5f..b98739eac6e 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc @@ -55,7 +55,7 @@ static Type GetQuantizedType(Builder builder, Type input_type, } else if (min.size() == max.size()) { auto shape = input_type.dyn_cast(); if (!shape || shape.getRank() <= quant_dim || - min.size() != shape.getDimSize(quant_dim)) { + static_cast(min.size()) != shape.getDimSize(quant_dim)) { return {}; } // TODO(b/141508873): the quantization dim is set to the last dimension. @@ -76,7 +76,8 @@ TypeAttr RescaleQuantizedType(Type input, Attribute factor) { if (auto qtype = ele_type.dyn_cast()) { ArrayRef scales = qtype.getScales(); // Broadcasting hasn't been implemented yet. - if (scales.size() != factor_values.getNumElements()) return {}; + if (static_cast(scales.size()) != factor_values.getNumElements()) + return {}; SmallVector new_scales; new_scales.reserve(scales.size()); auto scales_iter = scales.begin(); @@ -270,7 +271,7 @@ Type GetUniformQuantizedPerAxisTypeForWeight(ElementsAttr attr, int quant_dim, bool narrow_range) { Builder builder(attr.getContext()); auto shape = attr.getType().cast().getShape(); - if (shape.size() <= quant_dim) return {}; + if (static_cast(shape.size()) <= quant_dim) return {}; // `symmetric` can only be used when it is `signed` and `narrow_range`. if (symmetric && (!is_signed || !narrow_range)) return {}; @@ -335,7 +336,7 @@ quant::QuantizedType GetUniformQuantizedTypeForBias( const std::vector& op_types) { if (op_types.empty()) return {}; - int axis_size = 1; + size_t axis_size = 1; int32_t quant_dim = -1; Type expressed_type; // Requires all the op types are valid UniformQuantizedTypes or @@ -369,7 +370,7 @@ quant::QuantizedType GetUniformQuantizedTypeForBias( scales[index_scale.index()] *= index_scale.value(); } } else if (auto type = op_type.dyn_cast()) { - for (int index = 0; index != axis_size; ++index) { + for (int index = 0, e = axis_size; index != e; ++index) { scales[index] *= type.getScale(); } } diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index 1ae789f5468..5756fa6dec2 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -990,6 +990,13 @@ func @batch_to_space_nd(%arg0: tensor<4x2x2x3xf32>, %arg1: tensor<2xi32>, %arg2: // CHECK: "tfl.batch_to_space_nd"(%arg0, %arg1, %arg2) : (tensor<4x2x2x3xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor } +func @batch_to_space_nd_unsupported(%arg0: tensor, %arg1: tensor<3xi32>, %arg2: tensor<3x2xi32>) -> tensor { + %0 = "tf.BatchToSpaceND"(%arg0, %arg1, %arg2) : (tensor, tensor<3xi32>, tensor<3x2xi32>) -> tensor + return %0 : tensor + // CHECK-LABEL: batch_to_space_nd_unsupported + // CHECK: "tf.BatchToSpaceND" +} + func @space_to_batch_nd(%arg0: tensor<1x4x4x3xf32>, %arg1: tensor<2xi32>, %arg2: tensor<2x2xi32>) -> tensor { %0 = "tf.SpaceToBatchND"(%arg0, %arg1, %arg2) : (tensor<1x4x4x3xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor return %0 : tensor diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir index e1f496b91f4..4a83616408e 100644 --- 
a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir @@ -70,6 +70,7 @@ func @prepareAdd(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { } // CHECK-LABEL: prepareConv2DSplat +// PerTensor-LABEL: prepareConv2DSplat func @prepareConv2DSplat(%arg0: tensor<1x5x5x3xf32>) -> tensor<1x5x5x3xf32> { %w = constant dense<127.0> : tensor<3x3x3x3xf32> %b = constant dense<0.0> : tensor<3xf32> @@ -89,6 +90,7 @@ func @prepareConv2DSplat(%arg0: tensor<1x5x5x3xf32>) -> tensor<1x5x5x3xf32> { } // CHECK-LABEL: prepareConv2D +// PerTensor-LABEL: prepareConv2D func @prepareConv2D(%arg0: tensor<1x5x5x1xf32>) -> tensor<1x5x5x3xf32> { %w = constant dense<[[[[0.0]]], [[[127.0]]], [[[-127.0]]]]> : tensor<3x1x1x1xf32> %b = constant dense<0.0> : tensor<3xf32> @@ -108,6 +110,7 @@ func @prepareConv2D(%arg0: tensor<1x5x5x1xf32>) -> tensor<1x5x5x3xf32> { } // CHECK-LABEL: prepareDepthwiseConv2D +// PerTensor-LABEL: prepareDepthwiseConv2D func @prepareDepthwiseConv2D(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112x32xf32> { %w = constant dense<127.0> : tensor<32x3x3x3xf32> %b = constant dense<0.0> : tensor<32xf32> @@ -127,6 +130,7 @@ func @prepareDepthwiseConv2D(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112 } // CHECK-LABEL: QuantizeFullyConnected +// PerTensor-LABEL: QuantizeFullyConnected func @QuantizeFullyConnected(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112x32xf32> { %w = constant dense<127.0> : tensor<32x12xf32> %b = constant dense<0.0> : tensor<32xf32> @@ -143,3 +147,22 @@ func @QuantizeFullyConnected(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112 // PerTensor: %[[dq:.*]] = "tfl.dequantize"(%0) : (tensor<32x12x!quant.uniform:f32, 1.000000e+00>>) -> tensor<32x12xf32> // PerTensor: "tfl.fully_connected"(%arg0, %[[dq]] } + +// CHECK-LABEL: QuantizeTransposeConv +// PerTensor-LABEL: QuantizeTransposeConv +func @QuantizeTransposeConv(%arg0: tensor<32x4x4x128xf32>, %arg1: tensor<4xi32>) -> tensor<1x32x42x128xf32> { + %w = constant dense<127.0> : tensor<1x32x42x128xf32> + %b = constant dense<0.0> : tensor<1x32x42x128xf32> + %tc = "tfl.transpose_conv"(%arg1, %arg0, %w, %b) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, tensor<1x32x42x128xf32>) -> tensor<1x32x42x128xf32> + return %tc : tensor<1x32x42x128xf32> + +// CHECK: %[[CST:.*]] = constant dense<1.270000e+02> : tensor<1x32x42x128xf32> +// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CST]]) {qtype = tensor<1x32x42x128x!quant.uniform:f32:0, {1.000000e+00}>>, volatile} +// CHECK: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) : (tensor<1x32x42x128x!quant.uniform:f32:0, {1.000000e+00}>>) -> tensor<1x32x42x128xf32> +// CHECK: "tfl.transpose_conv"(%arg1, %arg0, %[[DEQUANTIZE]] + +// PerTensor: %[[CST:.*]] = constant dense<1.270000e+02> : tensor<1x32x42x128xf32> +// PerTensor: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CST]]) {qtype = tensor<1x32x42x128x!quant.uniform:f32, 1.000000e+00>>, volatile} +// PerTensor: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) : (tensor<1x32x42x128x!quant.uniform:f32, 1.000000e+00>>) -> tensor<1x32x42x128xf32> +// PerTensor: "tfl.transpose_conv"(%arg1, %arg0, %[[DEQUANTIZE]] +} diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir index e95f3d011e2..719430959d0 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir @@ -528,6 
+528,26 @@ func @PadStridedSliceNewAxisMask2(%arg0: tensor<4x64x64x1xf32>) -> tensor<1x4x64 return %1 : tensor<1x4x64x64xf32> } +// CHECK-LABEL: @StridedSliceRewriteMasks +func @StridedSliceRewriteMasks(%arg0: tensor<8x4x16x2xf32>) -> tensor<8x4x16x1xf32> { + %cst = "tf.Const"() {device = "", value = dense<[1, 0, 1]> : tensor<3xi32>} : () -> tensor<3xi32> + %cst_0 = "tf.Const"() {device = "", value = dense<[1, 0, 0]> : tensor<3xi32>} : () -> tensor<3xi32> + %cst_1 = "tf.Const"() {device = "", value = dense<1> : tensor<3xi32>} : () -> tensor<3xi32> + + // CHECK: %[[CST:.*]] = constant dense<[1, 0, 0, 1]> : tensor<4xi32> + // CHECK: %[[CST0:.*]] = constant dense<[1, 0, 0, 0]> : tensor<4xi32> + // CHECK: %[[CST1:.*]] = constant dense<1> : tensor<4xi32> + // CHECK: %[[RESULT:.*]] = "tf.StridedSlice"(%arg0, %[[CST]], %[[CST0]], %[[CST1]]) + // CHECK-SAME: begin_mask = 7 : i64 + // CHECK-SAME: ellipsis_mask = 0 : i64 + // CHECK-SAME: end_mask = 14 : i64 + // CHECK-SAME: new_axis_mask = 0 : i64 + // CHECK-SAME: shrink_axis_mask = 0 : i64 + + %0 = "tf.StridedSlice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 1 : i64, device = "", ellipsis_mask = 2 : i64, end_mask = 4 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<8x4x16x2xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x4x16x1xf32> + return %0 : tensor<8x4x16x1xf32> +} + // CHECK-LABEL: @MatrixSetDiagV2Conversion func @MatrixSetDiagV2Conversion(%arg0: tensor<3x3xi32>, %arg1: tensor<3xi32>) -> tensor<3x3xi32> { %cst = constant dense<0> : tensor diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc index 589515d6246..3fa2eae42f2 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc @@ -39,22 +39,18 @@ namespace tensorflow { void AddQuantizationPasses(const mlir::TFL::QuantizationSpecs& quant_specs, mlir::OpPassManager* pass_manager) { pass_manager->addPass(mlir::TFL::CreatePrepareQuantizePass(quant_specs)); - pass_manager->addPass(mlir::TFL::CreateQuantizePass()); - bool emit_quant_adaptor_ops = - quant_specs.inference_type != quant_specs.inference_input_type; - pass_manager->addPass( - mlir::TFL::CreatePostQuantizePass(emit_quant_adaptor_ops)); - if (quant_specs.default_ranges.first.hasValue() || quant_specs.default_ranges.second.hasValue()) { pass_manager->addPass(mlir::TFL::CreateDefaultQuantParamsPass( quant_specs.default_ranges.first.getValueOr(0.0), quant_specs.default_ranges.second.getValueOr(0.0), quant_specs.IsSignedInferenceType())); - pass_manager->addPass(mlir::TFL::CreateQuantizePass()); - pass_manager->addPass( - mlir::TFL::CreatePostQuantizePass(emit_quant_adaptor_ops)); } + pass_manager->addPass(mlir::TFL::CreateQuantizePass()); + bool emit_quant_adaptor_ops = + quant_specs.inference_type != quant_specs.inference_input_type; + pass_manager->addPass( + mlir::TFL::CreatePostQuantizePass(emit_quant_adaptor_ops)); } void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, @@ -63,7 +59,7 @@ void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, standard_pipeline_options.enable_inliner = false; standard_pipeline_options.form_clusters = pass_config.form_clusters; mlir::TF::CreateTFStandardPipeline(*pass_manager, standard_pipeline_options); - pass_manager->addPass(mlir::TFL::CreateDeviceIndexSelectorPass()); + pass_manager->addPass(mlir::TF::CreateDeviceIndexSelectorPass()); if (pass_config.shape_inference) { 
pass_manager->addPass(mlir::TF::CreateTFShapeInferencePass()); @@ -212,9 +208,6 @@ void CreateTFLStandardPipeline(OpPassManager& pm, // Saved model pass to mark global tensors immutable. pm.addPass(mlir::tf_saved_model::CreateOptimizeGlobalTensorsPass()); - // Used to mark non-exported functions in saved model private. - pm.addPass(mlir::tf_saved_model:: - CreateMarkFunctionVisibilityUsingSavedModelLinkagePass()); // Op fusion pass. pm.addPass(mlir::TFL::CreatePrepareCompositeFunctionsPass()); diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc index 46ed134d7ee..1328a2baf5d 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc @@ -28,9 +28,11 @@ limitations under the License. #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/Threading.h" #include "mlir/Dialect/Quant/FakeQuantSupport.h" // from @llvm-project #include "mlir/Dialect/Quant/UniformSupport.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project @@ -767,13 +769,26 @@ void LegalizeTF::runOnFunction() { [](Operation* op) { auto tfl_op = dyn_cast_or_null(op); if (!tfl_op) return false; - return succeeded(tfl_op.VerifyTflRuntimeConstraints( - tfl_op.getOperation(), - /*failure_on_operand_type_mismatch=*/false)); + return succeeded(tfl_op.VerifyTflRuntimeConstraints(op)); })); } else { target.addLegalDialect(); } + + // Ignore transient errors by registering a no-op handler. + // Applying legalization patterns will emit unwanted, transient errors when + // the replaced TFLite ops do not meet the sanity checks. In order to ignore + // the transient errors, the following lines override the diagnostic handler + // with a no-op handler only while this pass runs. + uint64_t current_thread_id = llvm::get_threadid(); + ScopedDiagnosticHandler scoped_diag_handler( + context, [&current_thread_id](Diagnostic&) -> LogicalResult { + // Consume only errors that are coming from the same thread in order not + // to ignore errors from other passes that are running. Things running + // in the pass manager can be multi-threaded. + return success(current_thread_id == llvm::get_threadid()); + }); + // Keep trying to convert. + // TODO(karimnosseir): This is similar to what applying greedy patterns does. + // Look if there is a function that tries until it converges. diff --git a/tensorflow/compiler/mlir/lite/transforms/passes.h b/tensorflow/compiler/mlir/lite/transforms/passes.h index 01e5eb1cb68..105c9394fb4 100644 --- a/tensorflow/compiler/mlir/lite/transforms/passes.h +++ b/tensorflow/compiler/mlir/lite/transforms/passes.h @@ -91,9 +91,6 @@ std::unique_ptr> CreateWhileOutlinePass(); // Verifies runtime constraints. std::unique_ptr> CreateRuntimeVerifyPass(); -// Creates function pass to select device index/fold tf.DeviceIndex.
-std::unique_ptr> CreateDeviceIndexSelectorPass(); - } // namespace TFL } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc index 9a1da0ad03d..33380e00543 100644 --- a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc @@ -52,7 +52,7 @@ class PostQuantizePass : public PassWrapper { void RemoveQuantizationAdaptorOps(FuncOp func) { mlir::OpBuilder builder(func.getBody()); - auto& bb = func.getBlocks().front(); + auto& bb = func.front(); auto* terminator = bb.getTerminator(); int num_args = bb.getNumArguments(); diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc index 3310c521a5a..6ee988496fa 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc @@ -584,46 +584,50 @@ struct ConvertTFStridedSlice : public RewritePattern { const int ellipsis_filled_dim_size = input_size - begin_shape[0] + 1; - llvm::APInt new_begin_mask = strided_slice_op.begin_mask(); - llvm::APInt new_end_mask = strided_slice_op.end_mask(); + int64_t begin_mask = strided_slice_op.begin_mask().getSExtValue(); + int64_t end_mask = strided_slice_op.end_mask().getSExtValue(); + int64_t new_begin_mask = 0; + int64_t new_end_mask = 0; SmallVector padded_begin; SmallVector padded_end; SmallVector padded_stride; // Before the ellipsis. - uint64_t index = 1; - int count = 0; - - while (index < ellipsis_mask) { - padded_begin.push_back(begin_dense_elem_attr.getValue(count)); - padded_end.push_back(end_dense_elem_attr.getValue(count)); - padded_stride.push_back(stride_dense_elem_attr.getValue(count)); - index <<= 1; - count++; + int index = 0; + int new_index = 0; + while (((ellipsis_mask >> index) & 1) == 0) { + padded_begin.push_back(begin_dense_elem_attr.getValue(index)); + padded_end.push_back(end_dense_elem_attr.getValue(index)); + padded_stride.push_back(stride_dense_elem_attr.getValue(index)); + if ((begin_mask >> index) & 1) new_begin_mask |= (1 << new_index); + if ((end_mask >> index) & 1) new_end_mask |= (1 << new_index); + ++index; + ++new_index; } // Ellipsis. - for (int i = 0; i < ellipsis_filled_dim_size; ++i) { - new_begin_mask |= ellipsis_mask; - new_end_mask |= ellipsis_mask; + for (; new_index < index + ellipsis_filled_dim_size; ++new_index) { + new_begin_mask |= (1 << new_index); + new_end_mask |= (1 << new_index); // Mimic the begin/end/strides mask behavior. padded_begin.push_back(0); padded_end.push_back(0); padded_stride.push_back(1); - - ellipsis_mask <<= 1; } // Account for ellipsis mask. - count++; + ++index; // After the ellipsis. 
- for (; count < begin_shape[0]; ++count) { - padded_begin.push_back(begin_dense_elem_attr.getValue(count)); - padded_end.push_back(end_dense_elem_attr.getValue(count)); - padded_stride.push_back(stride_dense_elem_attr.getValue(count)); + for (; index < begin_shape[0]; ++index) { + padded_begin.push_back(begin_dense_elem_attr.getValue(index)); + padded_end.push_back(end_dense_elem_attr.getValue(index)); + padded_stride.push_back(stride_dense_elem_attr.getValue(index)); + + if ((begin_mask >> index) & 1) new_begin_mask |= (1 << new_index); + if ((end_mask >> index) & 1) new_end_mask |= (1 << new_index); } auto attribute_type = rewriter.getIntegerType(64); @@ -645,7 +649,7 @@ struct ConvertTFStridedSlice : public RewritePattern { end_op.getResult(), stride_op.getResult(), rewriter.getIntegerAttr(attribute_type, new_begin_mask), rewriter.getIntegerAttr(attribute_type, new_end_mask), - rewriter.getI64IntegerAttr(0), + /*ellipsis_maks=*/rewriter.getI64IntegerAttr(0), rewriter.getIntegerAttr(attribute_type, strided_slice_op.new_axis_mask()), rewriter.getIntegerAttr(attribute_type, @@ -655,10 +659,12 @@ struct ConvertTFStridedSlice : public RewritePattern { LogicalResult matchAndRewrite(Operation *op, PatternRewriter &rewriter) const override { - // TODO(renjieliu): Consider expand the transformation for shrink - // mask as well. TF::StridedSliceOp strided_slice_op = llvm::cast(op); + // TODO(renjieliu): Consider expand the transformation for shrink mask as + // well. + if (strided_slice_op.shrink_axis_mask().getZExtValue()) return failure(); + // Handle new axis mask. uint64_t new_axis_mask = strided_slice_op.new_axis_mask().getZExtValue(); if (new_axis_mask != 0) { diff --git a/tensorflow/compiler/mlir/lite/transforms/runtime_verify.cc b/tensorflow/compiler/mlir/lite/transforms/runtime_verify.cc index 3268329b1c1..cc2e691180e 100644 --- a/tensorflow/compiler/mlir/lite/transforms/runtime_verify.cc +++ b/tensorflow/compiler/mlir/lite/transforms/runtime_verify.cc @@ -34,8 +34,7 @@ class RuntimeVerifyPass void RuntimeVerifyPass::runOnFunction() { getFunction().walk([&](TflRuntimeVerifyOpInterface op) { - if (failed(op.VerifyTflRuntimeConstraints( - op.getOperation(), /*failure_on_operand_type_mismatch=*/true))) + if (failed(op.VerifyTflRuntimeConstraints(op.getOperation()))) signalPassFailure(); }); } diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 904ccb7e820..b159815d5eb 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -57,6 +57,7 @@ gentbl( td_srcs = [ ":tensorflow_ops_td_files", ], + test = True, ) gentbl( @@ -88,6 +89,7 @@ gentbl( td_srcs = [ ":tensorflow_ops_td_files", ], + test = True, ) gentbl( @@ -112,6 +114,7 @@ gentbl( "@llvm-project//mlir:include/mlir/IR/OpBase.td", "@llvm-project//mlir:include/mlir/Dialect/StandardOps/IR/Ops.td", ], + test = True, ) gentbl( @@ -137,6 +140,7 @@ gentbl( "@llvm-project//mlir:include/mlir/Dialect/StandardOps/IR/Ops.td", "@llvm-project//mlir:include/mlir/IR/OpBase.td", ], + test = True, ) gentbl( @@ -161,6 +165,7 @@ gentbl( "@llvm-project//mlir:include/mlir/IR/OpBase.td", "@llvm-project//mlir:include/mlir/Dialect/StandardOps/IR/Ops.td", ], + test = True, ) gentbl( @@ -475,6 +480,7 @@ cc_library( "transforms/cluster_outlining.cc", "transforms/collection_ops_util.cc", "transforms/decompose_resource_ops_pass.cc", + "transforms/device_index_selector.cc", "transforms/einsum.cc", "transforms/executor_island_coarsening.cc", 
"transforms/executor_tpuv1_inline_tpu_island.cc", @@ -491,7 +497,6 @@ cc_library( "transforms/graph_pruning.cc", "transforms/launch_to_device_attribute.cc", "transforms/layout_optimization.cc", - "transforms/mark_function_visibility.cc", "transforms/materialize_mlir_passthrough_op.cc", "transforms/optimize.cc", "transforms/optimize_global_tensors.cc", @@ -661,7 +666,9 @@ cc_library( ":tensorflow_types", ":translate_utils", "//tensorflow/cc/saved_model:bundle_v2", + "//tensorflow/cc/saved_model:constants", "//tensorflow/cc/saved_model:loader_lite", + "//tensorflow/cc/saved_model:loader_util", "//tensorflow/compiler/jit:shape_inference_helpers", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", "//tensorflow/compiler/tf2xla:functionalize_control_flow", @@ -673,6 +680,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/grappler/utils:transitive_fanin", + "//tensorflow/core/platform:protobuf_internal", "//tensorflow/core/platform:types", "//tensorflow/stream_executor/lib", "@com_google_absl//absl/algorithm:container", @@ -682,7 +690,6 @@ cc_library( "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", "@llvm-project//llvm:Support", - "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:StandardOps", diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc index b8f0585040c..7dd74282487 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc @@ -299,13 +299,13 @@ ParseResult ParseReplicateOp(OpAsmParser* parser, OperationState* state) { parser->parseRegion(body, region_args, region_arg_types)) return failure(); - if (body.getBlocks().size() > 1) - return parser->emitError(loc) << "expects a single block region"; - // Ensure that the region is well formed: it contains at least a block with // a ReturnOp terminator. ReplicateOp::ensureTerminator(body, parser->getBuilder(), state->location); + if (!llvm::hasSingleElement(body)) + return parser->emitError(loc) << "expects a single block region"; + Operation& terminator = body.front().back(); if (!isa(terminator)) return parser->emitError(loc) << "expects a tf_device.return terminator"; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc index 3403651eef8..1e66eee06bb 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc @@ -220,13 +220,13 @@ ParseResult ParseGraphOp(OpAsmParser &parser, OperationState &result) { Region &body = *result.addRegion(); if (parser.parseRegion(body, llvm::None, llvm::None)) return failure(); - if (body.getBlocks().size() > 1) - return parser.emitError(loc) << "expects a single block region"; - // Ensure that the region is well formed: it contains at least a block with // a FetchOp terminator. GraphOp::ensureTerminator(body, parser.getBuilder(), result.location); + if (!llvm::hasSingleElement(body)) + return parser.emitError(loc) << "expects a single block region"; + // Get the results type from the terminator type inside the graph. 
Operation &fetch = body.back().back(); if (!isa(fetch)) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index d403462e6a6..65ca3ea4dbd 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -52,15 +52,12 @@ an output element, this operation computes \\(y = |x|\\). def TF_AcosOp : TF_Op<"Acos", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes acos of x element-wise."; - let description = [{ - }]; - let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$x + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -164,6 +161,81 @@ def TF_AddV2Op : TF_Op<"AddV2", [Commutative, NoSideEffect, ResultsBroadcastable let hasFolder = 1; } +def TF_AdjustContrastv2Op : TF_Op<"AdjustContrastv2", [NoSideEffect]> { + let summary = "Adjust the contrast of one or more images."; + + let description = [{ +`images` is a tensor of at least 3 dimensions. The last 3 dimensions are +interpreted as `[height, width, channels]`. The other dimensions only +represent a collection of images, such as `[batch, height, width, channels].` + +Contrast is adjusted independently for each channel of each image. + +For each channel, the Op first computes the mean of the image pixels in the +channel and then adjusts each component of each pixel to +`(x - mean) * contrast_factor + mean`. + }]; + + let arguments = (ins + TensorOf<[F16, F32]>:$images, + F32Tensor:$contrast_factor + ); + + let results = (outs + TensorOf<[F16, F32]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_AdjustHueOp : TF_Op<"AdjustHue", [NoSideEffect]> { + let summary = "Adjust the hue of one or more images."; + + let description = [{ +`images` is a tensor of at least 3 dimensions. The last dimension is +interpreted as channels, and must be three. + +The input image is considered in the RGB colorspace. Conceptually, the RGB +colors are first mapped into HSV. A delta is then applied all the hue values, +and then remapped back to RGB colorspace. + }]; + + let arguments = (ins + TensorOf<[F16, F32]>:$images, + F32Tensor:$delta + ); + + let results = (outs + TensorOf<[F16, F32]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_AdjustSaturationOp : TF_Op<"AdjustSaturation", [NoSideEffect]> { + let summary = "Adjust the saturation of one or more images."; + + let description = [{ +`images` is a tensor of at least 3 dimensions. The last dimension is +interpreted as channels, and must be three. + +The input image is considered in the RGB colorspace. Conceptually, the RGB +colors are first mapped into HSV. A scale is then applied all the saturation +values, and then remapped back to RGB colorspace. + }]; + + let arguments = (ins + TensorOf<[F16, F32]>:$images, + F32Tensor:$scale + ); + + let results = (outs + TensorOf<[F16, F32]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_AllOp : TF_Op<"All", [NoSideEffect]> { let summary = [{ Computes the "logical and" of elements across dimensions of a tensor. @@ -296,9 +368,6 @@ retained with length 1. 
def TF_ApproximateEqualOp : TF_Op<"ApproximateEqual", [Commutative, NoSideEffect]> { let summary = "Returns the truth value of abs(x-y) < tolerance element-wise."; - let description = [{ - }]; - let arguments = (ins TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$x, TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$y, @@ -436,11 +505,11 @@ tf.math.asin(y) # [1.047, 0.785] = x }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$x + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -569,11 +638,11 @@ tf.math.atan(y) # [1.047, 0.785] = x }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$x + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -659,9 +728,6 @@ window in `value`. def TF_AvgPoolGradOp : TF_Op<"AvgPoolGrad", [NoSideEffect]> { let summary = "Computes gradients of the average pooling function."; - let description = [{ - }]; - let arguments = (ins I32Tensor:$orig_input_shape, TF_FpTensor:$grad, @@ -855,48 +921,6 @@ reverse of SpaceToBatch. See below for a precise description. TF_DerivedOperandTypeAttr Tblock_shape = TF_DerivedOperandTypeAttr<1>; } -def TF_BesselI0eOp : TF_Op<"BesselI0e", [NoSideEffect, SameOperandsAndResultType]> { - let summary = "Computes the Bessel i0e function of `x` element-wise."; - - let description = [{ -Exponentially scaled modified Bessel function of order 0 defined as -`bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`. - -This function is faster and numerically stabler than `bessel_i0(x)`. - }]; - - let arguments = (ins - TF_FpTensor:$x - ); - - let results = (outs - TF_FpTensor:$y - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; -} - -def TF_BesselI1eOp : TF_Op<"BesselI1e", [NoSideEffect, SameOperandsAndResultType]> { - let summary = "Computes the Bessel i1e function of `x` element-wise."; - - let description = [{ -Exponentially scaled modified Bessel function of order 0 defined as -`bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`. - -This function is faster and numerically stabler than `bessel_i1(x)`. 
- }]; - - let arguments = (ins - TF_FpTensor:$x - ); - - let results = (outs - TF_FpTensor:$y - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; -} - def TF_BiasAddOp : TF_Op<"BiasAdd", [NoSideEffect]> { let summary = "Adds `bias` to `value`."; @@ -1327,9 +1351,6 @@ An n-way switch statement, implementing the following: def TF_CastOp : TF_Op<"Cast", [NoSideEffect, SameOperandsAndResultShape]> { let summary = "Cast x of type SrcT to y of DstT."; - let description = [{ - }]; - let arguments = (ins TF_Tensor:$x, @@ -1349,9 +1370,6 @@ def TF_CastOp : TF_Op<"Cast", [NoSideEffect, SameOperandsAndResultShape]> { def TF_CeilOp : TF_Op<"Ceil", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Returns element-wise smallest integer not less than x."; - let description = [{ - }]; - let arguments = (ins TF_FpTensor:$x ); @@ -1410,9 +1428,6 @@ greater than `clip_value_max` are set to `clip_value_max`. def TF_CollectiveBcastRecvOp : TF_Op<"CollectiveBcastRecv", []> { let summary = "Receives a tensor value broadcast from another device."; - let description = [{ - }]; - let arguments = (ins I64Attr:$group_size, I64Attr:$group_key, @@ -1432,9 +1447,6 @@ def TF_CollectiveBcastRecvOp : TF_Op<"CollectiveBcastRecv", []> { def TF_CollectiveBcastSendOp : TF_Op<"CollectiveBcastSend", []> { let summary = "Broadcasts a tensor value to one or more other devices."; - let description = [{ - }]; - let arguments = (ins TensorOf<[F16, F32, F64, I1, I32, I64]>:$input, @@ -1458,9 +1470,6 @@ def TF_CollectiveGatherOp : TF_Op<"CollectiveGather", []> { Mutually accumulates multiple tensors of identical type and shape. }]; - let description = [{ - }]; - let arguments = (ins TensorOf<[F16, F32, F64, I32, I64]>:$input, @@ -1484,9 +1493,6 @@ def TF_CollectiveReduceOp : TF_Op<"CollectiveReduce", [SameOperandsAndResultType Mutually reduces multiple tensors of identical type and shape. }]; - let description = [{ - }]; - let arguments = (ins TensorOf<[F16, F32, F64, I32, I64]>:$input, @@ -1566,9 +1572,6 @@ value is computed as \\( \sqrt{a^2 + b^2}\\). def TF_ConcatOp : TF_Op<"Concat", [NoSideEffect]> { let summary = "Concatenates tensors along one dimension."; - let description = [{ - }]; - let arguments = (ins I32Tensor:$concat_dim, Variadic:$values @@ -1625,9 +1628,6 @@ This is typically used by gradient computations for a concat operation. def TF_ConcatV2Op : TF_Op<"ConcatV2", [NoSideEffect]> { let summary = "Concatenates tensors along one dimension."; - let description = [{ - }]; - let arguments = (ins Variadic:$values, TF_I32OrI64Tensor:$axis @@ -1767,9 +1767,6 @@ def TF_Conv2DBackpropFilterOp : TF_Op<"Conv2DBackpropFilter", [NoSideEffect, TF_ Computes the gradients of convolution with respect to the filter. }]; - let description = [{ - }]; - let arguments = (ins TF_FpTensor:$input, I32Tensor:$filter_sizes, @@ -1803,9 +1800,6 @@ def TF_Conv2DBackpropInputOp : TF_Op<"Conv2DBackpropInput", [NoSideEffect, TF_La Computes the gradients of convolution with respect to the input. }]; - let description = [{ - }]; - let arguments = (ins I32Tensor:$input_sizes, TensorOf<[BF16, F16, F32, F64, I32]>:$filter, @@ -1877,9 +1871,6 @@ def TF_Conv3DBackpropFilterV2Op : TF_Op<"Conv3DBackpropFilterV2", [NoSideEffect] Computes the gradients of 3-D convolution with respect to the filter. 
}]; - let description = [{ - }]; - let arguments = (ins TF_FpTensor:$input, I32Tensor:$filter_sizes, @@ -1903,9 +1894,6 @@ def TF_Conv3DBackpropInputV2Op : TF_Op<"Conv3DBackpropInputV2", [NoSideEffect]> Computes the gradients of 3-D convolution with respect to the input. }]; - let description = [{ - }]; - let arguments = (ins TF_I32OrI64Tensor:$input_sizes, TF_FpTensor:$filter, @@ -2391,6 +2379,10 @@ def TF_DeviceIndexOp : TF_Op<"DeviceIndex", [NoSideEffect]> { let summary = "Return the index of device the op runs."; let description = [{ +Given a list of device names, this operation returns the index of the device +this op runs. The length of the list is returned in two cases: +(1) Device does not exist in the given device list. +(2) It is in XLA compilation. }]; let arguments = (ins @@ -2717,9 +2709,6 @@ def TF_EluGradOp : TF_Op<"EluGrad", [NoSideEffect, SameOperandsAndResultType]> { Computes gradients for the exponential linear (Elu) operation. }]; - let description = [{ - }]; - let arguments = (ins TF_FpTensor:$gradients, TF_FpTensor:$outputs @@ -2739,9 +2728,6 @@ Creates a tensor with the given shape. This operation creates a tensor of `shape` and `dtype`. }]; - let description = [{ - }]; - let arguments = (ins I32Tensor:$shape, @@ -2827,6 +2813,27 @@ the corresponding feature. TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<0>; } +def TF_EnsureShapeOp : TF_Op<"EnsureShape", [NoSideEffect]> { + let summary = "Ensures that the tensor's shape matches the expected shape."; + + let description = [{ +Raises an error if the input tensor's shape does not match the specified shape. +Returns the input tensor otherwise. + }]; + + let arguments = (ins + TF_Tensor:$input, + + TF_ShapeAttr:$shape + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_EqualOp : TF_Op<"Equal", [Commutative, NoSideEffect]> { let summary = "Returns the truth value of (x == y) element-wise."; @@ -2871,9 +2878,6 @@ tf.math.equal(x, y) ==> array([True, True]) def TF_ErfOp : TF_Op<"Erf", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes the Gauss error function of `x` element-wise."; - let description = [{ - }]; - let arguments = (ins TF_FpTensor:$x ); @@ -2890,9 +2894,6 @@ def TF_ErfcOp : TF_Op<"Erfc", [NoSideEffect, SameOperandsAndResultType]> { Computes the complementary error function of `x` element-wise. }]; - let description = [{ - }]; - let arguments = (ins TF_FpTensor:$x ); @@ -2907,9 +2908,6 @@ Computes the complementary error function of `x` element-wise. def TF_ErfinvOp : TF_Op<"Erfinv", [NoSideEffect]> { let summary = ""; - let description = [{ - }]; - let arguments = (ins TF_FpTensor:$x ); @@ -3107,6 +3105,25 @@ dimensions of `input`. TF_DerivedOperandTypeAttr Tcomplex = TF_DerivedOperandTypeAttr<0>; } +def TF_FakeParamOp : TF_Op<"FakeParam", [NoSideEffect]> { + let summary = [{ + This op is used as a placeholder in If branch functions. It doesn't provide a + valid output when run, so must either be removed (e.g. replaced with a + function input) or guaranteed not to be used (e.g. if mirroring an + intermediate output needed for the gradient computation of the other branch). 
+ }]; + + let arguments = (ins + TF_ShapeAttr:$shape + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + def TF_FakeQuantWithMinMaxArgsOp : TF_Op<"FakeQuantWithMinMaxArgs", [NoSideEffect, SameOperandsAndResultType]> { let summary = [{ Fake-quantize the 'inputs' tensor, type float to 'outputs' tensor of same type. @@ -3305,9 +3322,6 @@ fill([2, 3], 9) ==> [[9, 9, 9] def TF_FloorOp : TF_Op<"Floor", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Returns element-wise largest integer not greater than x."; - let description = [{ - }]; - let arguments = (ins TF_FpTensor:$x ); @@ -3844,6 +3858,28 @@ tf.math.greater_equal(x, y) ==> [True, False, True, True] TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_HSVToRGBOp : TF_Op<"HSVToRGB", [NoSideEffect]> { + let summary = "Convert one or more images from HSV to RGB."; + + let description = [{ +Outputs a tensor of the same shape as the `images` tensor, containing the RGB +value of the pixels. The output is only well defined if the value in `images` +are in `[0,1]`. + +See `rgb_to_hsv` for a description of the HSV encoding. + }]; + + let arguments = (ins + TF_FpTensor:$images + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_HashTableV2Op : TF_Op<"HashTableV2", []> { let summary = "Creates a non-initialized hash table."; @@ -4093,9 +4129,6 @@ def TF_IgammaGradAOp : TF_Op<"IgammaGradA", [NoSideEffect, ResultsBroadcastableS WithBroadcastableBinOpBuilder { let summary = "Computes the gradient of `igamma(a, x)` wrt `a`."; - let description = [{ - }]; - let arguments = (ins TF_F32OrF64Tensor:$a, TF_F32OrF64Tensor:$x @@ -4178,11 +4211,11 @@ I.e., \\(y = 1 / x\\). }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$x + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -4368,9 +4401,6 @@ tf.math.is_nan(x) ==> [False, True, False, True, False] def TF_IteratorGetNextOp : TF_Op<"IteratorGetNext", []> { let summary = "Gets the next output from the given iterator ."; - let description = [{ - }]; - let arguments = (ins TF_ResourceTensor:$iterator ); @@ -4439,9 +4469,6 @@ convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imag def TF_LRNGradOp : TF_Op<"LRNGrad", [NoSideEffect]> { let summary = "Gradients for Local Response Normalization."; - let description = [{ - }]; - let arguments = (ins TensorOf<[BF16, F16, F32]>:$input_grads, TensorOf<[BF16, F16, F32]>:$input_image, @@ -4463,9 +4490,6 @@ def TF_LRNGradOp : TF_Op<"LRNGrad", [NoSideEffect]> { def TF_LeakyReluOp : TF_Op<"LeakyRelu", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes rectified linear: `max(features, features * alpha)`."; - let description = [{ - }]; - let arguments = (ins TF_FpTensor:$features, @@ -4486,9 +4510,6 @@ def TF_LeakyReluGradOp : TF_Op<"LeakyReluGrad", [NoSideEffect, SameOperandsAndRe Computes rectified linear gradients for a LeakyRelu operation. 
}]; - let description = [{ - }]; - let arguments = (ins TF_FpTensor:$gradients, TF_FpTensor:$features, @@ -4769,9 +4790,6 @@ def TF_LogicalAndOp : TF_Op<"LogicalAnd", [Commutative, NoSideEffect, ResultsBro def TF_LogicalNotOp : TF_Op<"LogicalNot", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Returns the truth value of `NOT x` element-wise."; - let description = [{ - }]; - let arguments = (ins I1Tensor:$x ); @@ -4852,9 +4870,6 @@ The tensor `values` must be of the type of the table values. def TF_LookupTableSizeV2Op : TF_Op<"LookupTableSizeV2", []> { let summary = "Computes the number of elements in the given table."; - let description = [{ - }]; - let arguments = (ins TF_ResourceTensor:$table_handle ); @@ -5539,9 +5554,6 @@ retained with length 1. def TF_MaxPoolOp : TF_Op<"MaxPool", [NoSideEffect, TF_FoldOperandsTransposeInterface]> { let summary = "Performs max pooling on the input."; - let description = [{ - }]; - let arguments = (ins TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Qint8, TF_Uint16, TF_Uint8]>:$input, @@ -5568,9 +5580,6 @@ def TF_MaxPoolOp : TF_Op<"MaxPool", [NoSideEffect, TF_FoldOperandsTransposeInter def TF_MaxPool3DOp : TF_Op<"MaxPool3D", [NoSideEffect]> { let summary = "Performs 3D max pooling on the input."; - let description = [{ - }]; - let arguments = (ins TensorOf<[BF16, F16, F32]>:$input, @@ -5590,9 +5599,6 @@ def TF_MaxPool3DOp : TF_Op<"MaxPool3D", [NoSideEffect]> { def TF_MaxPool3DGradOp : TF_Op<"MaxPool3DGrad", [NoSideEffect]> { let summary = "Computes gradients of 3D max pooling function."; - let description = [{ - }]; - let arguments = (ins TensorOf<[BF16, F16, F32]>:$orig_input, TensorOf<[BF16, F16, F32]>:$orig_output, @@ -5615,9 +5621,6 @@ def TF_MaxPool3DGradOp : TF_Op<"MaxPool3DGrad", [NoSideEffect]> { def TF_MaxPoolGradOp : TF_Op<"MaxPoolGrad", [NoSideEffect]> { let summary = "Computes gradients of the maxpooling function."; - let description = [{ - }]; - let arguments = (ins TF_IntOrFpTensor:$orig_input, TF_IntOrFpTensor:$orig_output, @@ -5896,9 +5899,6 @@ Returns x * y element-wise. Returns zero if y is zero, even if x if infinite or def TF_MultinomialOp : TF_Op<"Multinomial", []> { let summary = "Draws samples from a multinomial distribution."; - let description = [{ - }]; - let arguments = (ins TF_IntOrFpTensor:$logits, I32Tensor:$num_samples, @@ -5918,9 +5918,6 @@ def TF_MultinomialOp : TF_Op<"Multinomial", []> { def TF_NdtriOp : TF_Op<"Ndtri", [NoSideEffect]> { let summary = ""; - let description = [{ - }]; - let arguments = (ins TF_FpTensor:$x ); @@ -5940,11 +5937,11 @@ I.e., \\(y = -x\\). }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$x + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -5955,9 +5952,6 @@ I.e., \\(y = -x\\). def TF_NoOp : TF_Op<"NoOp", [NoSideEffect]> { let summary = "Does nothing. 
Only useful as a placeholder for control edges."; - let description = [{ - }]; - let arguments = (ins); let results = (outs); @@ -6211,9 +6205,6 @@ output = def TF_OutfeedEnqueueTupleOp : TF_Op<"OutfeedEnqueueTuple", []> { let summary = "Enqueue multiple Tensor values on the computation outfeed."; - let description = [{ - }]; - let arguments = (ins Variadic:$inputs ); @@ -6498,9 +6489,6 @@ q_full, r_full = qr(a, full_matrices=True) def TF_QuantizeAndDequantizeOp : TF_Op<"QuantizeAndDequantize", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Use QuantizeAndDequantizeV2 instead."; - let description = [{ - }]; - let arguments = (ins TF_FpTensor:$input, @@ -6711,15 +6699,47 @@ the dimension is padded with zeros. TF_DerivedResultTypeAttr Tcomplex = TF_DerivedResultTypeAttr<0>; } +def TF_RGBToHSVOp : TF_Op<"RGBToHSV", [NoSideEffect]> { + let summary = "Converts one or more images from RGB to HSV."; + + let description = [{ +Outputs a tensor of the same shape as the `images` tensor, containing the HSV +value of the pixels. The output is only well defined if the value in `images` +are in `[0,1]`. + +`output[..., 0]` contains hue, `output[..., 1]` contains saturation, and +`output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0 +corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue. + +Usage Example: + +>>> blue_image = tf.stack([ +... tf.zeros([5,5]), +... tf.zeros([5,5]), +... tf.ones([5,5])], +... axis=-1) +>>> blue_hsv_image = tf.image.rgb_to_hsv(blue_image) +>>> blue_hsv_image[0,0].numpy() +array([0.6666667, 1. , 1. ], dtype=float32) + }]; + + let arguments = (ins + TF_FpTensor:$images + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_RandomGammaGradOp : TF_Op<"RandomGammaGrad", [NoSideEffect, ResultsBroadcastableShape]>, WithBroadcastableBinOpBuilder { let summary = [{ Computes the derivative of a Gamma random sample w.r.t. `alpha`. }]; - let description = [{ - }]; - let arguments = (ins TF_F32OrF64Tensor:$alpha, TF_F32OrF64Tensor:$sample @@ -6970,11 +6990,11 @@ I.e., \\(y = 1 / x\\). 
}]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$x + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -7049,9 +7069,6 @@ array([ 0., 0., -0., 3.], dtype=float32) def TF_Relu6Op : TF_Op<"Relu6", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes rectified linear 6: `min(max(features, 0), 6)`."; - let description = [{ - }]; - let arguments = (ins TF_IntOrFpTensor:$features ); @@ -7066,9 +7083,6 @@ def TF_Relu6Op : TF_Op<"Relu6", [NoSideEffect, SameOperandsAndResultType]> { def TF_Relu6GradOp : TF_Op<"Relu6Grad", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes rectified linear 6 gradients for a Relu6 operation."; - let description = [{ - }]; - let arguments = (ins TF_IntOrFpTensor:$gradients, TF_IntOrFpTensor:$features @@ -7084,9 +7098,6 @@ def TF_Relu6GradOp : TF_Op<"Relu6Grad", [NoSideEffect, SameOperandsAndResultType def TF_ReluGradOp : TF_Op<"ReluGrad", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes rectified linear gradients for a Relu operation."; - let description = [{ - }]; - let arguments = (ins TF_IntOrFpTensor:$gradients, TF_IntOrFpTensor:$features @@ -7208,14 +7219,29 @@ Input images can be of different types but output images are always float. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_ResizeBilinearGradOp : TF_Op<"ResizeBilinearGrad", [NoSideEffect]> { + let summary = "Computes the gradient of bilinear interpolation."; + + let arguments = (ins + F32Tensor:$grads, + TF_FpTensor:$original_image, + + DefaultValuedAttr:$align_corners, + DefaultValuedAttr:$half_pixel_centers + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; +} + def TF_ResizeNearestNeighborOp : TF_Op<"ResizeNearestNeighbor", [NoSideEffect]> { let summary = [{ Resize `images` to `size` using nearest neighbor interpolation. }]; - let description = [{ - }]; - let arguments = (ins TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Uint16, TF_Uint8]>:$images, I32Tensor:$size, @@ -7332,9 +7358,6 @@ var <- var - mom def TF_ResourceApplyGradientDescentOp : TF_Op<"ResourceApplyGradientDescent", []> { let summary = "Update '*var' by subtracting 'alpha' * 'delta' from it."; - let description = [{ - }]; - let arguments = (ins TF_ResourceTensor:$var, TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$alpha, @@ -7697,11 +7720,11 @@ according to the current system rounding mode use std::cint. 
}]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$x + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -8117,9 +8140,6 @@ select(condition, t, e) ==> [[1, 2], def TF_SelectV2Op : TF_Op<"SelectV2", [NoSideEffect, ResultsBroadcastableShape]> { let summary = ""; - let description = [{ - }]; - let arguments = (ins I1Tensor:$condition, TF_Tensor:$t, @@ -8168,9 +8188,6 @@ def TF_SeluGradOp : TF_Op<"SeluGrad", [NoSideEffect, SameOperandsAndResultType]> Computes gradients for the scaled exponential linear (Selu) operation. }]; - let description = [{ - }]; - let arguments = (ins TF_FpTensor:$gradients, TF_FpTensor:$outputs @@ -8421,9 +8438,6 @@ whose values are extracted from 'input' starting at the offsets in def TF_SnapshotOp : TF_Op<"Snapshot", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Returns a copy of the input tensor."; - let description = [{ - }]; - let arguments = (ins TF_Tensor:$input ); @@ -8488,9 +8502,6 @@ Inputs are the logits, not probabilities. def TF_SoftplusOp : TF_Op<"Softplus", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes softplus: `log(exp(features) + 1)`."; - let description = [{ - }]; - let arguments = (ins TF_FpTensor:$features ); @@ -8505,9 +8516,6 @@ def TF_SoftplusOp : TF_Op<"Softplus", [NoSideEffect, SameOperandsAndResultType]> def TF_SoftplusGradOp : TF_Op<"SoftplusGrad", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes softplus gradients for a softplus operation."; - let description = [{ - }]; - let arguments = (ins TF_FpTensor:$gradients, TF_FpTensor:$features @@ -8523,9 +8531,6 @@ def TF_SoftplusGradOp : TF_Op<"SoftplusGrad", [NoSideEffect, SameOperandsAndResu def TF_SoftsignOp : TF_Op<"Softsign", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes softsign: `features / (abs(features) + 1)`."; - let description = [{ - }]; - let arguments = (ins TF_FpTensor:$features ); @@ -8540,9 +8545,6 @@ def TF_SoftsignOp : TF_Op<"Softsign", [NoSideEffect, SameOperandsAndResultType]> def TF_SoftsignGradOp : TF_Op<"SoftsignGrad", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes softsign gradients for a softsign operation."; - let description = [{ - }]; - let arguments = (ins TF_FpTensor:$gradients, TF_FpTensor:$features @@ -8790,9 +8792,6 @@ are checked during execution. def TF_SplitOp : TF_Op<"Split", [NoSideEffect]> { let summary = "Splits a tensor into `num_split` tensors along one dimension."; - let description = [{ - }]; - let arguments = (ins I32Tensor:$split_dim, TF_Tensor:$value @@ -8811,9 +8810,6 @@ def TF_SplitOp : TF_Op<"Split", [NoSideEffect]> { def TF_SplitVOp : TF_Op<"SplitV", [NoSideEffect]> { let summary = "Splits a tensor into `num_split` tensors along one dimension."; - let description = [{ - }]; - let arguments = (ins TF_Tensor:$value, TF_I32OrI64Tensor:$size_splits, @@ -8877,11 +8873,11 @@ I.e., \\(y = x * x = x^2\\). 
}]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$x + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -8950,9 +8946,6 @@ shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1] def TF_StackCloseV2Op : TF_Op<"StackCloseV2", []> { let summary = "Delete the stack from its resource container."; - let description = [{ - }]; - let arguments = (ins TF_ResourceTensor:$handle ); @@ -8963,9 +8956,6 @@ def TF_StackCloseV2Op : TF_Op<"StackCloseV2", []> { def TF_StackPopV2Op : TF_Op<"StackPopV2", []> { let summary = "Pop the element at the top of the stack."; - let description = [{ - }]; - let arguments = (ins TF_ResourceTensor:$handle ); @@ -8980,9 +8970,6 @@ def TF_StackPopV2Op : TF_Op<"StackPopV2", []> { def TF_StackPushV2Op : TF_Op<"StackPushV2", []> { let summary = "Push an element onto the stack."; - let description = [{ - }]; - let arguments = (ins TF_ResourceTensor:$handle, TF_Tensor:$elem, @@ -9000,9 +8987,6 @@ def TF_StackPushV2Op : TF_Op<"StackPushV2", []> { def TF_StackV2Op : TF_Op<"StackV2", []> { let summary = "A stack that produces elements in first-in last-out order."; - let description = [{ - }]; - let arguments = (ins I32Tensor:$max_size, @@ -9015,6 +8999,32 @@ def TF_StackV2Op : TF_Op<"StackV2", []> { ); } +def TF_StatelessRandomUniformOp : TF_Op<"StatelessRandomUniform", [NoSideEffect]> { + let summary = [{ +Outputs deterministic pseudorandom random values from a uniform distribution. + }]; + + let description = [{ +The generated values follow a uniform distribution in the range `[0, 1)`. The +lower bound 0 is included in the range, while the upper bound 1 is excluded. + +The outputs are a deterministic function of `shape` and `seed`. + }]; + + let arguments = (ins + TF_I32OrI64Tensor:$shape, + TF_I32OrI64Tensor:$seed + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tseed = TF_DerivedOperandTypeAttr<1>; + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + def TF_StopGradientOp : TF_Op<"StopGradient", [NoSideEffect, TF_AllTypesMatch<["input", "output"]>]> { let summary = "Stops gradient computation."; @@ -9508,11 +9518,11 @@ Given an input tensor, this function computes tangent of every }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$x + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -9694,9 +9704,6 @@ calculation gets its own TensorArray accumulator. def TF_TensorArrayReadV3Op : TF_Op<"TensorArrayReadV3", []> { let summary = "Read an element from the TensorArray into output `value`."; - let description = [{ - }]; - let arguments = (ins TF_ResourceTensor:$handle, I32Tensor:$index, @@ -9736,9 +9743,6 @@ Scatter the data from the input value into specific TensorArray elements. 
def TF_TensorArraySizeV3Op : TF_Op<"TensorArraySizeV3", []> { let summary = "Get the current size of the TensorArray."; - let description = [{ - }]; - let arguments = (ins TF_ResourceTensor:$handle, F32Tensor:$flow_in @@ -9815,9 +9819,6 @@ Write data via Write and read via Read or Pack. def TF_TensorArrayWriteV3Op : TF_Op<"TensorArrayWriteV3", []> { let summary = "Push an element onto the tensor_array."; - let description = [{ - }]; - let arguments = (ins TF_ResourceTensor:$handle, I32Tensor:$index, @@ -9938,9 +9939,6 @@ values: The tensor. def TF_TensorListGetItemOp : TF_Op<"TensorListGetItem", [NoSideEffect]> { let summary = ""; - let description = [{ - }]; - let arguments = (ins TF_VariantTensor:$input_handle, I32Tensor:$index, @@ -10070,9 +10068,6 @@ output_handle: The TensorList. def TF_TensorListSetItemOp : TF_Op<"TensorListSetItem", [NoSideEffect]> { let summary = ""; - let description = [{ - }]; - let arguments = (ins TF_VariantTensor:$input_handle, I32Tensor:$index, @@ -10862,9 +10857,6 @@ def TF_XdivyOp : TF_Op<"Xdivy", [NoSideEffect, ResultsBroadcastableShape]>, WithBroadcastableBinOpBuilder { let summary = "Returns 0 if x == 0, and x / y otherwise, elementwise."; - let description = [{ - }]; - let arguments = (ins TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$x, TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$y @@ -11041,9 +11033,6 @@ def TF_XlaHostComputeOp : TF_Op<"XlaHostCompute", []> { A pseudo-op to represent host-side computation in an XLA program. }]; - let description = [{ - }]; - let arguments = (ins Variadic:$inputs, @@ -11114,9 +11103,6 @@ https://www.tensorflow.org/performance/xla/operation_semantics#pad def TF_XlaRecvFromHostOp : TF_Op<"XlaRecvFromHost", []> { let summary = "An op to receive a tensor from the host."; - let description = [{ - }]; - let arguments = (ins TF_ShapeAttr:$shape, StrAttr:$key @@ -11154,9 +11140,6 @@ https://www.tensorflow.org/performance/xla/operation_semantics#reduce . def TF_XlaReplicaIdOp : TF_Op<"XlaReplicaId", [NoSideEffect]> { let summary = "Replica ID."; - let description = [{ - }]; - let arguments = (ins); let results = (outs @@ -11196,9 +11179,6 @@ i=0...N-1. def TF_XlaSendToHostOp : TF_Op<"XlaSendToHost", []> { let summary = "An op to send a tensor to the host."; - let description = [{ - }]; - let arguments = (ins TF_Tensor:$input, @@ -11242,9 +11222,6 @@ tensor such that tensor[...,:,:] = u[..., :, :] * Diag(s[..., :]) * Transpose(v[ def TF_Xlog1pyOp : TF_Op<"Xlog1py", [NoSideEffect]> { let summary = "Returns 0 if x == 0, and x * log1p(y) otherwise, elementwise."; - let description = [{ - }]; - let arguments = (ins TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$x, TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$y @@ -11261,9 +11238,6 @@ def TF_XlogyOp : TF_Op<"Xlogy", [NoSideEffect, ResultsBroadcastableShape]>, WithBroadcastableBinOpBuilder { let summary = "Returns 0 if x == 0, and x * log(y) otherwise, elementwise."; - let description = [{ - }]; - let arguments = (ins TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$x, TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$y @@ -11279,9 +11253,6 @@ def TF_XlogyOp : TF_Op<"Xlogy", [NoSideEffect, ResultsBroadcastableShape]>, def TF_ZerosLikeOp : TF_Op<"ZerosLike", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Returns a tensor of zeros with the same shape and type as x."; - let description = [{ - }]; - let arguments = (ins TF_Tensor:$x ); @@ -11388,9 +11359,6 @@ expected to create these operators. 
def TF__HostComputeMlirOp : TF_Op<"_HostComputeMlir", []> { let summary = "A host-side computation called from a TPU device."; - let description = [{ - }]; - let arguments = (ins Variadic:$inputs, @@ -11470,9 +11438,6 @@ def TF__XlaRecvAtHostOp : TF_Op<"_XlaRecvAtHost", []> { A placeholder op to receive values from a running XLA computation. }]; - let description = [{ - }]; - let arguments = (ins TF_StrTensor:$dynamic_key, @@ -11490,9 +11455,6 @@ A placeholder op to receive values from a running XLA computation. def TF__XlaSendFromHostOp : TF_Op<"_XlaSendFromHost", []> { let summary = "A placeholder op to send values to a running XLA computation."; - let description = [{ - }]; - let arguments = (ins Variadic:$inputs, TF_StrTensor:$dynamic_key, diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index d8675bb786f..f5d8fbae46a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -232,6 +232,7 @@ else_branch: A function that takes 'inputs' and returns a list of def TF_YieldOp : TF_Op<"Yield", [Terminator]> { let summary = "Yield operation"; + let description = [{ The "yield" operation represents a return operation within the conditional and body of structured control flow (e.g., if and while). The operation @@ -497,6 +498,7 @@ Inserts a placeholder for a tensor that will be always fed. def TF_PlaceholderWithDefaultOp : TF_Op<"PlaceholderWithDefault", [NoSideEffect]> { let summary = "Placeholder op"; + let description = [{ A placeholder op that passes through input when its output is not fed. }]; @@ -839,9 +841,6 @@ def TF_XlaShardingOp : TF_Op<"XlaSharding", [NoSideEffect]> { An op which shards the input based on the given sharding attribute. }]; - let description = [{ - }]; - let arguments = (ins TF_Tensor:$input, @@ -858,9 +857,6 @@ An op which shards the input based on the given sharding attribute. def TF_InfeedDequeueTupleOp : TF_Op<"InfeedDequeueTuple", []> { let summary = "Fetches multiple values from infeed as an XLA tuple."; - let description = [{ - }]; - let arguments = (ins OptionalAttr:$_XlaSharding ); @@ -904,9 +900,6 @@ def TF_BatchDatasetV2Op : TF_Op<"BatchDatasetV2", [NoSideEffect]> { Creates a dataset that batches `batch_size` elements from `input_dataset`. }]; - let description = [{ - }]; - let arguments = (ins TF_VariantTensor:$input_dataset, I64Tensor:$batch_size, @@ -1048,4 +1041,46 @@ operation create / operate on a copy of `x`. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_BesselI0eOp : TF_Op<"BesselI0e", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Computes the Bessel i0e function of `x` element-wise."; + + let description = [{ +Exponentially scaled modified Bessel function of order 0 defined as +`bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`. + +This function is faster and numerically stabler than `bessel_i0(x)`. + }]; + + let arguments = (ins + TF_FpTensor:$x + ); + + let results = (outs + TF_FpTensor:$y + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_BesselI1eOp : TF_Op<"BesselI1e", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Computes the Bessel i1e function of `x` element-wise."; + + let description = [{ +Exponentially scaled modified Bessel function of order 0 defined as +`bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`. + +This function is faster and numerically stabler than `bessel_i1(x)`. 
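+
+Usage example (an illustrative sketch; the `tf.math.bessel_i1e` Python binding
+is assumed here, and output values are omitted):
+
+>>> x = tf.constant([-1.0, 0.5, 1.0])
+>>> y = tf.math.bessel_i1e(x)  # elementwise exp(-|x|) * bessel_i1(x)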
+ }]; + + let arguments = (ins + TF_FpTensor:$x + ); + + let results = (outs + TF_FpTensor:$y + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + #endif // TF_OPS diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc index 140a778770c..5a7d81d4c0c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc @@ -28,6 +28,7 @@ limitations under the License. #include "mlir/IR/Identifier.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project @@ -76,6 +77,23 @@ static LogicalResult Verify(GlobalTensorOp global_tensor) { return success(); } +static LogicalResult Verify(SessionInitializerOp session_initializer) { + mlir::SymbolTable symbol_table( + session_initializer.getParentOfType()); + + auto init_func_op = + symbol_table.lookup(session_initializer.initializer()); + if (!init_func_op) + return session_initializer.emitOpError() + << "the initializer function does not exist"; + + if (!init_func_op.getType().getResults().empty()) + return session_initializer.emitOpError() + << "the initializer function should have no output"; + + return success(); +} + #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc.inc" @@ -212,14 +230,36 @@ static LogicalResult VerifySavedModelModule( } } for (auto func : module.getOps()) { - if (HasAnyTfSavedModelArgAttr(func)) { - if (!IsExported(func)) { - return func.emitError() - << "can only apply 'tf_saved_model' argument attributes " - "to exported functions"; - } + const bool is_exported = IsExported(func); + + if (is_exported && !func.isPublic()) { + return func.emitError() + << "exported function @" << func.getName() << " should be public"; + } + + if (!is_exported && func.isPublic()) { + return func.emitError() << "non-exported function @" << func.getName() + << " should be private"; + } + + if (!is_exported && HasAnyTfSavedModelArgAttr(func)) { + return func.emitError() << "can only apply 'tf_saved_model' argument " + "attributes to exported functions"; } } + + auto session_initializers = module.getOps(); + if (!session_initializers.empty() && + !llvm::hasSingleElement(session_initializers)) { + return (*++session_initializers.begin()).emitError() + << "there must be no more than one session_initializer op"; + } + + auto is_init = [&session_initializers](mlir::FuncOp func) { + if (session_initializers.empty()) return false; + return (*session_initializers.begin()).initializer() == func.getName(); + }; + SymbolTable symbol_table(module); auto symbol_uses = SymbolTable::getSymbolUses(&module.getBodyRegion()); if (!symbol_uses.hasValue()) { @@ -230,6 +270,12 @@ static LogicalResult VerifySavedModelModule( auto func = symbol_table.lookup( symbol_use.getSymbolRef().cast().getValue()); if (func && IsExported(func)) { + // If it is an init function, then it can be used by the unique + // session_initializer op. 
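+      // For instance, with an op such as
+      //   "tf_saved_model.session_initializer"() { initializer = @init } : () -> ()
+      // the reference to the exported @init function is accepted rather than
+      // reported as an internal reference. (Illustrative; @init is an example name.)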
+ if (is_init(func) && + llvm::isa(symbol_use.getUser())) + continue; + return symbol_use.getUser() ->emitError("exported function cannot be internally referenced") .attachNote(func.getLoc()) @@ -349,5 +395,39 @@ GlobalTensorOp LookupBoundInput(FuncOp func, int arg_index, return symbol_table.lookup(attr.getValue()); } +SessionInitializerOp GetSessionInitializerOp(mlir::ModuleOp op) { + auto initializers = op.getOps(); + if (initializers.empty()) return {}; + return *initializers.begin(); +} + +class OptimizeSessionInitializerPattern + : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(SessionInitializerOp op, + PatternRewriter &rewriter) const override { + SymbolTable symbol_table(op.getParentOfType()); + auto init_func_op = symbol_table.lookup(op.initializer()); + + // The init function can only be referenced from the SessionInitializerOp. + // And there is at most one SessionInitializerOp in the module. So both ops + // have no other uses and can be simply erased. + if (init_func_op.front().begin()->isKnownTerminator()) { + rewriter.eraseOp(init_func_op); + rewriter.eraseOp(op); + return success(); + } + + return failure(); + } +}; + +void SessionInitializerOp::getCanonicalizationPatterns( + OwningRewritePatternList &results, MLIRContext *context) { + results.insert(context); +} + } // namespace tf_saved_model } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h index 47ebb1a1be5..b6f8753cc51 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h @@ -61,6 +61,10 @@ GlobalTensorOp LookupBoundInput(FuncOp func, int arg_index, // should have. Type GetBoundInputArgTypeFor(GlobalTensorOp global_tensor); +// Returns the session initializer of this module if it exists. Returns null +// otherwise. +SessionInitializerOp GetSessionInitializerOp(mlir::ModuleOp op); + } // namespace tf_saved_model } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td index 4431a160edf..dc1210a4d2a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td @@ -128,4 +128,30 @@ def TfSavedModel_GlobalTensorOp : TfSavedModel_Op<"global_tensor"> { let verifier = [{ return Verify(*this); }]; } +def TfSavedModel_SessionInitializerOp: TfSavedModel_Op<"session_initializer"> { + let summary = "Initializes TensorFlow session state."; + let description = [{ + The session initializer op marks a function that must be called by an + external agent exactly once to initialize TensorFlow session state, and this + must happen before any other exported functions are called. There must be no + more than one session initializer in a saved model. + + The `initializer` represents the initialization function. The function have + no output and this function should be only called once. 
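+
+    For example, a module carrying a session initializer would contain an op of
+    the following form (an illustrative sketch; the function name @init is not
+    prescribed by this op):
+
+      "tf_saved_model.session_initializer"() { initializer = @init } : () -> ()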
+ + This is used, for example, to initialize hash tables stored in resources and + accessed by resource name (rather than as resource handles or bound inputs + which is how `global_tensor`s are referenced) + }]; + + let arguments = (ins + FlatSymbolRefAttr:$initializer + ); + + + let verifier = [{ return Verify(*this); }]; + + let hasCanonicalizer = 1; +} + #endif // SAVED_MODEL_DIALECT diff --git a/tensorflow/compiler/mlir/tensorflow/tests/function_visibility.mlir b/tensorflow/compiler/mlir/tensorflow/tests/function_visibility.mlir deleted file mode 100644 index 55af3cffde3..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/tests/function_visibility.mlir +++ /dev/null @@ -1,47 +0,0 @@ -// RUN: tf-opt -tf-saved-model-mark-func-visibility -split-input-file %s | FileCheck --check-prefix=SAVEDMODEL %s -// RUN: tf-opt -tf-mark-func-visibility -split-input-file -verify-diagnostics %s | FileCheck %s - - -module attributes {tf_saved_model.semantics} { - // SAVEDMODEL: func @func_exported_1() attributes {tf_saved_model.exported_names = ["func_exported_1"]} - func @func_exported_1() attributes {tf_saved_model.exported_names = ["func_exported_1"]} { - "tf.some_call"() {callee = {callee = {callee = @child}}} : () -> () - return - } - - // SAVEDMODEL: func @func_exported_2() attributes {tf_saved_model.exported_names = ["func_exported_2"]} - func @func_exported_2() attributes {tf_saved_model.exported_names = ["func_exported_2"]} { - "tf.some_call"() {callee = {callee = {callee = @child}}} : () -> () - return - } - - // SAVEDMODEL: func @func_not_exported() attributes {sym_visibility = "private"} - func @func_not_exported() { - return - } - -} - -// ----- - -module { - // CHECK: func @func_with_entry_spec(%arg0: tensor<1xi32>) -> tensor<1xi32> attributes {tf.entry_function = {inputs = "x", outputs = "y"}} - func @func_with_entry_spec(%arg0: tensor<1xi32>) -> tensor<1xi32> attributes {tf.entry_function = {inputs = "x", outputs = "y"}} { - return %arg0 : tensor<1xi32> - } - - // CHECK: func @func_without_entry_spec(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> tensor<*xi32> attributes {sym_visibility = "private"} - func @func_without_entry_spec(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> tensor<*xi32> { - %0 = "tf.AddV2"(%arg0, %arg1) {T = i32, device = ""} : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> - return %0 : tensor<*xi32> - } -} - -// ----- - -module { - // expected-error @+1 {{can't overwrite the visibility of function private_func_with_entry_spec with private visibility}} - func @private_func_with_entry_spec(%arg0: tensor<1xi32>) -> tensor<1xi32> attributes {tf.entry_function = {inputs = "x", outputs = "y"}, sym_visibility = "private"} { - return %arg0 : tensor<1xi32> - } -} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir index 1599d53ed15..1af4ba6b3dc 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir @@ -433,4 +433,17 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK: return %[[CAST_RESULT_0]], %[[CAST_RESULT_1]], %[[ADDI]] return %27, %28, %2 : tensor<*xui8>, tensor<*xi8>, tensor<*xi8> } + + // CHECK-LABEL: infer_device_launch + func @infer_device_launch(%arg0: tensor<1x8x2xi32>) -> (tensor<*xf32>, tensor<*xf32>) { + %0 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %1 = "tf_device.launch"() ({ + %2 = "tf.Cast"(%arg0) {Truncate = false} : 
(tensor<1x8x2xi32>) -> tensor<1x8x2xf32> + tf_device.return %2 : tensor<1x8x2xf32> + // CHECK: () -> tensor<1x8x2xf32> + }) {device = "/device:CPU:0"} : () -> tensor<*xf32> + // CHECK: (tensor, tensor<1x8x2xf32>) -> (tensor<1x8x1xf32>, tensor<1x8x1xf32>) + %3:2 = "tf.Split"(%0, %1) {device = ""} : (tensor, tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>) + return %3#0, %3#1 : tensor<*xf32>, tensor<*xf32> + } } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/build_defs.bzl b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/build_defs.bzl index 594afa10453..95ad05aa1e6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/build_defs.bzl +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/build_defs.bzl @@ -4,8 +4,6 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "lit_test") def tf_saved_model_test(name, data, tags = None): """Create a SavedModel test.""" - if tags == None: - tags = ["no_rocm"] native.py_binary( name = name, testonly = 1, @@ -26,5 +24,5 @@ def tf_saved_model_test(name, data, tags = None): name = name + ".py", data = [name] + data, driver = "@llvm-project//mlir:run_lit.sh", - tags = tags, + tags = tags + ["no_rocm"], ) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py index 7171f63bb05..5bfcfa5378a 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/common_v1.py @@ -46,7 +46,10 @@ def set_tf_options(): # This function needs to take a "create_module_fn", as opposed to just the # module itself, because the creation of the module has to be delayed until # after absl and tensorflow have run various initialization steps. -def do_test(signature_def_map, show_debug_info=False): +def do_test(signature_def_map, + init_op=None, + canonicalize=False, + show_debug_info=False): """Runs test. 1. Performs absl and tf "main"-like initialization that must run before almost @@ -61,6 +64,9 @@ def do_test(signature_def_map, show_debug_info=False): Args: signature_def_map: A map from string key to signature_def. The key will be used as function name in the resulting MLIR. + init_op: The initializer op for the saved model. If set, it will generate a + initializer graph in the resulting MLIR. + canonicalize: If true, canonicalizer will be run on the resulting MLIR. show_debug_info: If true, shows debug locations in the resulting MLIR. """ @@ -84,6 +90,7 @@ def do_test(signature_def_map, show_debug_info=False): builder.add_meta_graph_and_variables( sess, [tf.saved_model.tag_constants.SERVING], signature_def_map, + main_op=init_op, strip_default_attrs=True) builder.save() @@ -97,6 +104,9 @@ def do_test(signature_def_map, show_debug_info=False): mlir = pywrap_mlir.experimental_run_pass_pipeline(mlir, 'tf-standard-pipeline', show_debug_info) + if canonicalize: + mlir = pywrap_mlir.experimental_run_pass_pipeline(mlir, 'canonicalize', + show_debug_info) print(mlir) app.run(app_main) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py new file mode 100644 index 00000000000..16290455608 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/hash_table_v1.py @@ -0,0 +1,92 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# RUN: %p/hash_table_v1 | FileCheck %s + +# pylint: disable=missing-docstring,line-too-long +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf +from tensorflow.compiler.mlir.tensorflow.tests.tf_saved_model import common_v1 + +# Verify that the tf.versions attribute exists. It is difficult to enforce +# contents, since the version numbers change over time. The conversion logic +# itself is verified in the common graphdef converter, so here just assert +# it is being invoked. +# CHECK: module +# CHECK-SAME: tf.versions +# CHECK-SAME: bad_consumers +# CHECK-SAME: min_consumer +# CHECK-SAME: producer + +# CHECK: "tf_saved_model.session_initializer"() {initializer = [[init:@.*]]} : () -> () +# CHECK: "tf_saved_model.global_tensor"() + +# CHECK: func [[init]] +# CHECK-NEXT: [[R5:%.*]] = "tf.Const"() +# CHECK-NEXT: [[R6:%.*]] = "tf.Const"() +# CHECK-NEXT: [[R7:%.*]] = "tf.HashTableV2"() +# CHECK-SAME: shared_name = "[[hash_table:.*]]" +# CHECK-NEXT: "tf.LookupTableImportV2"([[R7]], [[R5]], [[R6]]) + +# CHECK: func {{@[a-zA-Z_0-9]+}}( +# CHECK-SAME: [[ARG0:%.*]]: tensor +# CHECK-SAME: [[ARG1:%.*]]: tensor, value = {{.*}} : tensor<1x3xf32>} : () -> () +# CHECK-NOT: session_initializer + +# CHECK: func {{@[a-zA-Z_0-9]+}}( +# CHECK-SAME: [[ARG0:%.*]]: tensor<3x1xf32> {tf_saved_model.index_path = ["x"]}, +# CHECK-SAME: [[ARG1:%.*]]: tensor>> {tf_saved_model.bound_input = @[[VAR]]}) +# CHECK-SAME: -> (tensor<3x3xf32> {tf_saved_model.index_path = ["r"]}) +# CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["key"] + +# CHECK-NEXT: [[R0:%.*]] = "tf.ReadVariableOp"([[ARG1]]) {{{.*}}} : (tensor>>) -> tensor<1x3xf32> +# CHECK-NEXT: [[R1:%.*]] = "tf.MatMul"([[ARG0]], [[R0]]) {{{.*}}} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> +# CHECK-NEXT: return [[R1]] : tensor<3x3xf32> + + +def Test(): + + x = tf.constant([[1.0], [1.0], [1.0]]) + y = tf.compat.v1.get_variable( + name='y', + shape=(1, 3), + initializer=tf.random_normal_initializer(), + trainable=True) + r = tf.matmul(x, y) + + tensor_info_x = tf.compat.v1.saved_model.utils.build_tensor_info(x) + tensor_info_r = tf.compat.v1.saved_model.utils.build_tensor_info(r) + + return { + 'key': (tf.compat.v1.saved_model.signature_def_utils.build_signature_def( + inputs={'x': tensor_info_x}, + outputs={'r': tensor_info_r}, + method_name='some_function')) + } + + +if __name__ == '__main__': + common_v1.set_tf_options() + common_v1.do_test( + Test(), tf.initializers.global_variables(), canonicalize=True) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_delete_unused_funcs.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_delete_unused_funcs.mlir deleted file mode 100644 index 6f2c47a935f..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_delete_unused_funcs.mlir 
+++ /dev/null @@ -1,96 +0,0 @@ -// RUN: tf-opt -tf-saved-model-mark-func-visibility -symbol-dce -split-input-file %s | FileCheck %s - -module attributes {tf_saved_model.semantics} { - - // Test case: Unused function should be deleted. - - // CHECK-NOT: func @unused - func @unused() { - return - } - -} - -// ----- - -module attributes {tf_saved_model.semantics} { - - // Test case: Root calls child. Child should not be deleted. - - // CHECK: func @root - func @root() attributes {tf_saved_model.exported_names = ["root"]} { - "tf.some_call"() { callee = @child } : () -> () - return - } - - // CHECK: func @child - func @child() { - return - } - -} - -// ----- - -module attributes {tf_saved_model.semantics} { - - // Test case: Don't crash if attribute that doesn't reference a func. - - "tf.some_opaque_global_variable"() { sym_name = "some_global" } : () -> () - - func @root2() attributes {tf_saved_model.exported_names = ["root2"]} { - "tf.do_something_with_a_global"() { global = @some_global } : () -> () - return - } - -} - -// ----- - -module attributes {tf_saved_model.semantics} { - - // Test case: Delete recursively dead cycle. - - // CHECK-NOT: func @recursively_dead0 - func @recursively_dead0() { - "tf.some_call"() { callee = @recursively_dead1 } : () -> () - return - } - // CHECK-NOT: func @recursively_dead1 - func @recursively_dead1() { - "tf.some_call"() { callee = @recursively_dead0 } : () -> () - return - } - -} - -// ----- - -module attributes {tf_saved_model.semantics} { - - // Test case: Root calls child with a deeply nested symbol reference. - // Child should not be deleted. - - // CHECK: func @root - func @root() attributes {tf_saved_model.exported_names = ["root"]} { - "tf.some_call"() {callee = {callee = {callee = @child}}} : () -> () - return - } - - // CHECK: func @child - func @child() { - return - } - -} - -// ----- - -// Test case: If the module doesn't have tf_saved_model semantics, then this -// pass shouldn't do anything. -module { - // CHECK: func @not_dead() - func @not_dead() { - return - } -} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir index 38627b41b68..6c32a3bc4d6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir @@ -64,7 +64,7 @@ module attributes {tf_saved_model.semantics} { return } - func @f_callee(%arg0: tensor>>) { + func @f_callee(%arg0: tensor>>) attributes {sym_visibility = "private"} { return } } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir index 21e3bef8fd8..26cdf025a10 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir @@ -2,6 +2,11 @@ module attributes {tf_saved_model.semantics} { + // CHECK: tf_saved_model.session_initializer + "tf_saved_model.session_initializer"() { + initializer = @init + } : () -> () + // Representation for constants: (immutable) global tensor. 
// CHECK: tf_saved_model.global_tensor "tf_saved_model.global_tensor"() { @@ -35,7 +40,18 @@ module attributes {tf_saved_model.semantics} { return %arg0 : tensor } - func @f() { + func @f() attributes {sym_visibility = "private"} { + return + } + + // Representation for init functions + // CHECK: func @init + // CHECK-SAME: exported_names = ["__tf_saved_model_session_initializer"] + func @init( + %arg1: tensor>> {tf_saved_model.bound_input = @some_constant} + ) attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer"]} + { + "tf.some_call"(%arg1) : (tensor>>) -> () return } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir index c055c6c9f56..260174b184f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir @@ -3,7 +3,7 @@ module attributes {tf_saved_model.semantics} { // expected-error@+1 {{unknown tf_saved_model dialect arg attribute 'tf_saved_model.not_a_real_arg_attr'}} - func @f(%arg0: tensor {tf_saved_model.not_a_real_arg_attr = 1 : i32}) { + func @f(%arg0: tensor {tf_saved_model.not_a_real_arg_attr = 1 : i32}) attributes {sym_visibility = "private"} { return } @@ -233,7 +233,7 @@ module attributes {tf_saved_model.semantics} { "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor, value = dense<1.> : tensor<1xf32> } : () -> () // expected-error@+1 {{can only apply 'tf_saved_model' argument attributes to exported functions}} func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}) - -> (tensor {tf_saved_model.index_path = []}) { + -> (tensor {tf_saved_model.index_path = []}) attributes {sym_visibility = "private"} { %0 = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor return %0 : tensor } @@ -258,3 +258,97 @@ module attributes {tf_saved_model.semantics} { // expected-error@+1 {{'type' attribute for immutable 'tf_saved_model.global_tensor' should have a static shape}} "tf_saved_model.global_tensor"() { sym_name = "v", type = tensor, value = dense<1.> : tensor<1xf32> } : () -> () } + +// ----- + +module attributes {tf_saved_model.semantics} { + + // expected-error@+1 {{the initializer function does not exist}} + "tf_saved_model.session_initializer"() { initializer = @init } : () -> () +} + +// ----- + +module attributes {tf_saved_model.semantics} { + + // expected-error@+1 {{the initializer function should have no output}} + "tf_saved_model.session_initializer"() { initializer = @init } : () -> () + func @init() -> tensor<1xf32> attributes {sym_visibility = "private"} { + %0 = "tf.Const"() {value = dense<[1.0]> : tensor<1xf32> } : () -> tensor<1xf32> + return %0 : tensor<1xf32> + } +} + +// ----- + +module attributes {tf_saved_model.semantics} { + + "tf_saved_model.session_initializer"() { initializer = @init } : () -> () + // expected-error@+1 {{there must be no more than one session_initializer op}} + "tf_saved_model.session_initializer"() { initializer = @init } : () -> () + func @init() -> tensor<1xf32> attributes {sym_visibility = "private"} { + %0 = "tf.Const"() {value = dense<[1.0]> : tensor<1xf32> } : () -> tensor<1xf32> + return %0 : tensor<1xf32> + } +} + +// ----- + +module attributes {tf_saved_model.semantics} { + + // expected-error@+1 {{exported function @f should be public}} + func @f( + %arg0: tensor {tf.resource_name = "resource"} + ) attributes { sym_visibility = "private", 
tf_saved_model.exported_names = ["foo.some_func"] } { + return + } + +} + +// ----- + +module attributes {tf_saved_model.semantics} { + + // expected-error@+1 {{non-exported function @f should be private}} + func @f( + %arg0: tensor {tf.resource_name = "resource"} + ) { + return + } + +} + +// ----- + +module attributes {tf_saved_model.semantics} { + + // expected-error@+1 {{the initializer function does not exist}} + "tf_saved_model.session_initializer"() { initializer = @init } : () -> () +} + +// ----- + +module attributes {tf_saved_model.semantics} { + + // expected-error@+1 {{the initializer function should have no output}} + "tf_saved_model.session_initializer"() { initializer = @init } : () -> () + func @init() -> (tensor<1xf32> {tf_saved_model.index_path = ["output"]}) + attributes { tf_saved_model.exported_names = ["__tf_saved_model_session_initializer"] } { + %0 = "tf.Const"() {value = dense<[1.0]> : tensor<1xf32> } : () -> tensor<1xf32> + return %0 : tensor<1xf32> + } +} + +// ----- + +module attributes {tf_saved_model.semantics} { + + "tf_saved_model.session_initializer"() { initializer = @init } : () -> () + // expected-error@+1 {{there must be no more than one session_initializer op}} + "tf_saved_model.session_initializer"() { initializer = @init } : () -> () + func @init() -> (tensor<1xf32> {tf_saved_model.index_path = ["output"]}) + attributes { tf_saved_model.exported_names = ["__tf_saved_model_session_initializer"] } { + %0 = "tf.Const"() {value = dense<[1.0]> : tensor<1xf32> } : () -> tensor<1xf32> + return %0 : tensor<1xf32> + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir index 9d8911d306d..0c68cf0cf64 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir @@ -1,7 +1,7 @@ // RUN: tf-opt -tf-saved-model-optimize-global-tensors -split-input-file %s | FileCheck %s //===----------------------------------------------------------------------===// -// Freezing. +// Immutability. //===----------------------------------------------------------------------===// module attributes {tf_saved_model.semantics} { @@ -142,3 +142,89 @@ module attributes {tf_saved_model.semantics} { // Test running the pass on a module that does not have // tf_saved_model.semantics. 
module {} + +// ----- + +// Test use as an input in unhandled op +module attributes {tf_saved_model.semantics} { + + // CHECK: "tf_saved_model.global_tensor"() { + // CHECK-SAME: is_mutable + // CHECK-SAME: } : () -> () + "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor, value = dense<42.> : tensor } : () -> () + + func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}) + attributes {tf_saved_model.exported_names = ["f"]} { + "tf.unhandled_op"(%arg0) : (tensor>>) -> () + return + } +} + + +// ----- + +// Test use as a region capture in an unhandled op +module attributes {tf_saved_model.semantics} { + + // CHECK: "tf_saved_model.global_tensor"() { + // CHECK-SAME: is_mutable + // CHECK-SAME: } : () -> () + "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor, value = dense<42.> : tensor } : () -> () + + func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}) + attributes {tf_saved_model.exported_names = ["f"]} { + "tf.unhandled"() ({ + %val = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor + "tf.unhandled_terminator"() : () -> () + }) : () -> () + return + } +} + +// ----- + +// Test use as region capture as well as input in an unhandled op +// to the unhandled op. +module attributes {tf_saved_model.semantics} { + + // CHECK: "tf_saved_model.global_tensor"() { + // CHECK-SAME: is_mutable + // CHECK-SAME: } : () -> () + "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor, value = dense<42.> : tensor } : () -> () + + // CHECK: "tf_saved_model.global_tensor"() { + // CHECK-SAME: is_mutable + // CHECK-SAME: } : () -> () + "tf_saved_model.global_tensor"() { is_mutable, sym_name = "u", type = tensor, value = dense<22.> : tensor } : () -> () + + func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}, %arg1: tensor>> {tf_saved_model.bound_input = @u}) + attributes {tf_saved_model.exported_names = ["f"]} { + %0 = "tf.unhandled"(%arg0) ({ + %val = "tf.ReadVariableOp"(%arg1) : (tensor>>) -> tensor + "tf.unhandled_terminator"() : () -> () + }) : (tensor>>) -> (tensor>>) + return + } +} + +// ----- + +// Test multiple global tensors uses as operands for an unhandled op. 
+module attributes {tf_saved_model.semantics} { + + // CHECK: "tf_saved_model.global_tensor"() { + // CHECK-SAME: is_mutable + // CHECK-SAME: } : () -> () + "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor, value = dense<42.> : tensor } : () -> () + + // CHECK: "tf_saved_model.global_tensor"() { + // CHECK-SAME: is_mutable + // CHECK-SAME: } : () -> () + "tf_saved_model.global_tensor"() { is_mutable, sym_name = "u", type = tensor, value = dense<22.> : tensor } : () -> () + + func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}, %arg1: tensor>> {tf_saved_model.bound_input = @u}) + attributes {tf_saved_model.exported_names = ["f"]} { + "tf.unhandled"(%arg0, %arg1) : (tensor>>, tensor>>) -> () + return + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors_interprocedural.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors_interprocedural.mlir index 91e8c9c4b66..14a0006cd3b 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors_interprocedural.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors_interprocedural.mlir @@ -20,12 +20,12 @@ module attributes {tf_saved_model.semantics} { return %val : tensor } - func @f_callee(%arg0: tensor<*x!tf.resource>) -> tensor { + func @f_callee(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %val = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @f_callee_callee} : (tensor<*x!tf.resource>) -> (tensor) return %val : tensor } - func @f_callee_callee(%arg0: tensor<*x!tf.resource>) -> tensor { + func @f_callee_callee(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %val = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>) -> tensor return %val : tensor } @@ -59,7 +59,7 @@ module attributes {tf_saved_model.semantics} { return %val : tensor } - func @f_common(%arg0: tensor<*x!tf.resource>) -> tensor { + func @f_common(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %val = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>) -> tensor return %val : tensor } @@ -85,7 +85,7 @@ module attributes {tf_saved_model.semantics} { return %val_2 : tensor } - func @f_callee(%arg0: tensor<*x!tf.resource>) -> tensor { + func @f_callee(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %cst_1 = constant dense<2.0> : tensor return %cst_1 : tensor } @@ -112,13 +112,13 @@ module attributes {tf_saved_model.semantics} { } // CHECK: func @f_callee(%arg0: tensor<*x!tf.resource>) -> tensor - func @f_callee(%arg0: tensor<*x!tf.resource>) -> tensor { + func @f_callee(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %val = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @f_callee_callee} : (tensor<*x!tf.resource>) -> (tensor) return %val : tensor } // CHECK: func @f_callee_callee(%arg0: tensor<*x!tf.resource>) -> tensor - func @f_callee_callee(%arg0: tensor<*x!tf.resource>) -> tensor { + func @f_callee_callee(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %c0 = "tf.Const"() { value = dense<1.0> : tensor } : () -> tensor "tf.AssignVariableOp"(%arg0, %c0) : (tensor<*x!tf.resource>, tensor) -> () return %c0 : tensor @@ -146,13 +146,13 @@ module attributes {tf_saved_model.semantics} { } // CHECK: func @f_callee(%arg0: 
tensor<*x!tf.resource>) -> tensor - func @f_callee(%arg0: tensor<*x!tf.resource>) -> tensor { + func @f_callee(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %val = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @f_callee_callee} : (tensor<*x!tf.resource>) -> (tensor) return %val : tensor } // CHECK: func @f_callee_callee(%arg0: tensor<*x!tf.resource>) -> tensor - func @f_callee_callee(%arg0: tensor<*x!tf.resource>) -> tensor { + func @f_callee_callee(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %c0 = "tf.Const"() { value = dense<1.0> : tensor } : () -> tensor "tf.AssignVariableOp"(%arg0, %c0) : (tensor<*x!tf.resource>, tensor) -> () return %c0 : tensor @@ -179,13 +179,13 @@ module attributes {tf_saved_model.semantics} { // CHECK: func @f(%arg0: tensor<*x!tf.resource>) -> tensor - func @f(%arg0: tensor<*x!tf.resource>) -> tensor { + func @f(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %val = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @g} : (tensor<*x!tf.resource>) -> (tensor) return %val : tensor } // CHECK: func @g(%arg0: tensor<*x!tf.resource>) -> tensor - func @g(%arg0: tensor<*x!tf.resource>) -> tensor { + func @g(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %val = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @f} : (tensor<*x!tf.resource>) -> (tensor) return %val : tensor } @@ -212,7 +212,7 @@ module attributes {tf_saved_model.semantics} { // CHECK: func @f(%arg0: tensor<*x!tf.resource>) -> tensor - func @f(%arg0: tensor<*x!tf.resource>) -> tensor { + func @f(%arg0: tensor<*x!tf.resource>) -> tensor attributes {sym_visibility = "private"} { %c0 = "tf.Const"() { value = dense<1.0> : tensor } : () -> tensor "tf.AssignAddVariableOp"(%arg0, %c0) : (tensor<*x!tf.resource>, tensor) -> () return %c0 : tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir index 6bb8e99d796..d88489f5da0 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir @@ -262,7 +262,6 @@ func @single_outside_compiled_input_output_single_outside_compilation(%arg0: ten return %1 : tensor } - // Tests extraction of a single outside compiled cluster with multiple input/output. // CHECK-LABEL: func @multiple_outside_compiled_input_output_single_outside_compilation @@ -439,3 +438,24 @@ func @multiple_outside_compiled_inputs_single_outside_compilation(%arg0: tensor< return %1 : tensor } + +// Tests only directly used results of tpu cluster are remapped with +// parallel_execute. 
+ +// CHECK-LABEL: func @remapped_results +func @remapped_results(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]:2 = "tf_device.parallel_execute" + // CHECK: tf_device.return %[[PARALLEL_EXECUTE_OUTPUT]]#1 : tensor + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2:2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"(%3) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %5:2 = "tf.C"(%4) : (tensor) -> (tensor, tensor) + tf_device.return %5#0, %5#1 : tensor, tensor + }) {cluster_attr = "cluster_attr"} : () -> (tensor, tensor) + tf_device.return %2#1 : tensor + } + return %1 : tensor +} diff --git a/tensorflow/compiler/mlir/lite/transforms/device_index_selector.cc b/tensorflow/compiler/mlir/tensorflow/transforms/device_index_selector.cc similarity index 92% rename from tensorflow/compiler/mlir/lite/transforms/device_index_selector.cc rename to tensorflow/compiler/mlir/tensorflow/transforms/device_index_selector.cc index d4aed750dc8..550647a915a 100644 --- a/tensorflow/compiler/mlir/lite/transforms/device_index_selector.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/device_index_selector.cc @@ -21,11 +21,11 @@ limitations under the License. #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" namespace mlir { -namespace TFL { +namespace TF { namespace { // Folds the DeviceIndex op to a constant value. The DeviceIndex return the @@ -55,8 +55,8 @@ void DeviceIndexSelector::runOnOperation() { // Convert all the DeviceIndex ops to constant values. func.getBody().walk([](TF::DeviceIndexOp op) { // This just selects the default in all cases where DeviceIndex feeds into - // tf.Case. This could be enhanced based on explicit TFLite specification or - // TAC in future. + // tf.Case. This could be enhanced to have some sort of policy in the + // future. OpBuilder b(op); RankedTensorType type = RankedTensorType::get({}, b.getIntegerType(32)); int index = op.device_names().size(); @@ -79,7 +79,7 @@ std::unique_ptr> CreateDeviceIndexSelectorPass() { } static PassRegistration pass( - "tfl-device-index-selector", "Fold tf.DeviceIndex to constant"); + "tf-device-index-selector", "Fold tf.DeviceIndex to constant"); -} // namespace TFL +} // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/fold_switch.cc b/tensorflow/compiler/mlir/tensorflow/transforms/fold_switch.cc index 4d26747ebdc..b47378762a9 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/fold_switch.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/fold_switch.cc @@ -199,7 +199,7 @@ static void MatchSwitchFoldOps(tf_executor::SwitchOp switch_op, // Folds merge nodes with only a single non-dead input. static LogicalResult FoldMergeNodes(FuncOp function, const DeadQueue& queue) { // Create builder for val_index of MergeOp. 
- auto* block = &function.getBlocks().front(); + auto* block = &function.front(); OpBuilder builder = OpBuilder::atBlockEnd(block); auto type = builder.getIntegerType(32); auto build_index = [&](Location loc, int value) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc b/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc index 4b10550df7b..d10f5e26e8f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc @@ -52,11 +52,6 @@ struct FusedKernelMatcherPass void runOnFunction() override; }; -// Returns an op's name with the dialect prefix stripped off. -StringRef GetOpNameWithoutDialect(Operation *op) { - return op->getName().getStringRef().split(".").second; -} - bool IsActivationFunction(Operation *op) { return isa(op) || isa(op) || isa(op); } @@ -128,8 +123,8 @@ class FuseContractionWithBiasAdd : public OpRewritePattern { } SmallVector locations{contraction.getLoc(), bias_add.getLoc()}; - SmallVector fused_ops{ - StringAttr::get(GetOpNameWithoutDialect(bias_add), context)}; + SmallVector fused_ops{StringAttr::get( + bias_add.getOperation()->getName().stripDialect(), context)}; // BiasAdd may or may not feed into an activation function. auto activation = GetActivation(bias_add); @@ -143,7 +138,7 @@ class FuseContractionWithBiasAdd : public OpRewritePattern { if (fuse_activation) { locations.push_back(activation->getLoc()); fused_ops.push_back( - StringAttr::get(GetOpNameWithoutDialect(activation), context)); + StringAttr::get(activation->getName().stripDialect(), context)); result_type = activation->getResultTypes().front(); } else { result_type = bias_add.getResult().getType(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_test_pass.h b/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_test_pass.h index faecdf04368..0e6d844bed3 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_test_pass.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables_test_pass.h @@ -96,15 +96,19 @@ class FakeSession : public tensorflow::Session { for (const std::string& output_name : output_names) { Tensor output; if (output_name == "dense/bias") { - outputs->push_back( - Tensor(tensorflow::DT_FLOAT, tensorflow::TensorShape({50}))); + Tensor t = Tensor(tensorflow::DT_FLOAT, tensorflow::TensorShape({50})); + t.flat().setZero(); + outputs->push_back(t); } else if (output_name == "dense/kernel") { - outputs->push_back( - Tensor(tensorflow::DT_FLOAT, tensorflow::TensorShape({100, 50}))); + Tensor t = + Tensor(tensorflow::DT_FLOAT, tensorflow::TensorShape({100, 50})); + t.flat().setZero(); + outputs->push_back(t); } else { // Create a scalar float tensor. - outputs->push_back( - Tensor(tensorflow::DT_FLOAT, tensorflow::TensorShape({}))); + Tensor t = Tensor(tensorflow::DT_FLOAT, tensorflow::TensorShape({})); + t.flat()(0) = 1.0f; + outputs->push_back(t); } } return Status::OK(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/mark_function_visibility.cc b/tensorflow/compiler/mlir/tensorflow/transforms/mark_function_visibility.cc deleted file mode 100644 index 31a80a4ecdb..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/transforms/mark_function_visibility.cc +++ /dev/null @@ -1,165 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "llvm/ADT/STLExtras.h" -#include "mlir/IR/Module.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" - -#define DEBUG_TYPE "tf-shape-inference" - -namespace mlir { - -namespace { - -LogicalResult MarkFunctionVisibility( - ModuleOp module, llvm::function_ref IsExternalVisible) { - LogicalResult result = success(); - - for (auto func : module.getOps()) { - FuncOp::Visibility old_visibility = func.getVisibility(); - - FuncOp::Visibility visibility = IsExternalVisible(func) - ? FuncOp::Visibility::Public - : FuncOp::Visibility::Private; - - auto get_visibility_name = [](FuncOp::Visibility v) { - return v == FuncOp::Visibility::Public - ? "public" - : v == FuncOp::Visibility::Private ? "private" : "nested"; - }; - - if (old_visibility != SymbolTable::Visibility::Public && - old_visibility != visibility) { - result = func.emitError() - << "can't overwrite the visibility of function " - << func.getName() << " with " - << get_visibility_name(old_visibility) << " visibility"; - } - - LLVM_DEBUG(llvm::dbgs() - << "function " << func.getName() << " has " - << get_visibility_name(visibility) << " visibility \n"); - - func.setVisibility(visibility); - } - - return result; -} - -} // anonymous namespace - -namespace TF { - -LogicalResult MarkFunctionVisibilityUsingEntryFunctionSpecification( - ModuleOp module) { - auto HasEntryFunctionSpecification = [](FuncOp func) -> bool { - auto attrs = func.getAttrOfType("tf.entry_function"); - return attrs && !attrs.empty(); - }; - return MarkFunctionVisibility(module, HasEntryFunctionSpecification); -} - -namespace { -struct MarkFunctionVisibilityUsingEntryFunctionSpecificationPass - : public PassWrapper< - MarkFunctionVisibilityUsingEntryFunctionSpecificationPass, - OperationPass> { - void runOnOperation() override { - if (failed(MarkFunctionVisibilityUsingEntryFunctionSpecification( - getOperation()))) { - signalPassFailure(); - } - } -}; -} // namespace - -static PassRegistration< - MarkFunctionVisibilityUsingEntryFunctionSpecificationPass> - pass("tf-mark-func-visibility", - "Use tf.entry_function to mark function visibility."); - -std::unique_ptr> -CreateMarkFunctionVisibilityUsingEntryFunctionSpecificationPass() { - return std::make_unique< - MarkFunctionVisibilityUsingEntryFunctionSpecificationPass>(); -} - -// Marks the main function with public visibility, while other functions are -// marked with private visibility. 
-LogicalResult MarkOnlyMainFunctionWithPublicVisibility(ModuleOp module) { - for (auto func : module.getOps()) { - if (func.getName() == "main") { - func.setVisibility(FuncOp::Visibility::Public); - } else { - func.setVisibility(FuncOp::Visibility::Private); - } - } - return success(); -} - -namespace { -struct MarkOnlyMainFunctionWithPublicVisibilityPass - : public PassWrapper> { - void runOnOperation() override { - if (failed(MarkOnlyMainFunctionWithPublicVisibility(getOperation()))) { - signalPassFailure(); - } - } -}; -} // namespace - -std::unique_ptr> -CreateMarkOnlyMainFunctionWithPublicVisibilityPass() { - return std::make_unique(); -} - -} // namespace TF - -namespace tf_saved_model { - -static LogicalResult MarkFunctionVisibilityUsingSavedModelLinkage( - ModuleOp module) { - if (!tf_saved_model::HasTfSavedModelSemantics(module)) { - return success(); - } - return MarkFunctionVisibility(module, tf_saved_model::IsExported); -} - -namespace { -struct MarkFunctionVisibilityUsingSavedModelLinkagePass - : public PassWrapper> { - void runOnOperation() override { - if (failed(MarkFunctionVisibilityUsingSavedModelLinkage(getOperation()))) { - signalPassFailure(); - } - } -}; -} // namespace - -static PassRegistration pass( - "tf-saved-model-mark-func-visibility", - "Use tf_saved_model linkage information to mark function visibility."); - -std::unique_ptr> -CreateMarkFunctionVisibilityUsingSavedModelLinkagePass() { - return std::make_unique(); -} - -} // namespace tf_saved_model - -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/materialize_mlir_passthrough_op.cc b/tensorflow/compiler/mlir/tensorflow/transforms/materialize_mlir_passthrough_op.cc index 94fdfb310ac..3ed27d7ce30 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/materialize_mlir_passthrough_op.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/materialize_mlir_passthrough_op.cc @@ -71,7 +71,7 @@ void MaterializePassthroughOpPass::runOnFunction() { return; } Region &body = main.getBody(); - if (body.getBlocks().size() != 1) { + if (!llvm::hasSingleElement(body)) { op->emitError() << "MLIR Opaque Op expects a main() entry point with a " "single block\n"; return; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc index cd8f988fd5f..07cc6203cbd 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc @@ -56,14 +56,14 @@ struct GlobalTensorUse { using GlobalTensorUsesMap = std::map>; -static bool IsResourceType(Type type) { +bool IsResourceType(Type type) { if (auto tensor_type = type.dyn_cast()) { return tensor_type.getElementType().isa(); } return false; } -static bool IsResource(Value value) { return IsResourceType(value.getType()); } +bool IsResource(Value value) { return IsResourceType(value.getType()); } class ResourceAnalyzer { public: @@ -129,30 +129,24 @@ class ResourceAnalyzer { // this errs on the side of being conservative. We should improve // this by using either a property or a trait that clearly // identifies ops with resource mutating behavior. - if (PropagatePotentiallyWrittenWithinUnhandledOp(op)) { - return; - } + PropagatePotentiallyWrittenWithinUnhandledOp(op); }); return success(); } // If an op is not one of the handled ones, we assume all resource usages // within its purview are mutating in nature. 
- bool PropagatePotentiallyWrittenWithinUnhandledOp(Operation* op) { + void PropagatePotentiallyWrittenWithinUnhandledOp(Operation* op) { for (auto operand : op->getOperands()) { if (IsResource(operand)) { SetPotentiallyWritten(operand); - return true; } } - bool uses_resources = false; visitUsedValuesDefinedAbove(op->getRegions(), [&](OpOperand* operand) { if (IsResource(operand->get())) { SetPotentiallyWritten(operand->get()); - uses_resources = true; } }); - return uses_resources; } // Given a funcOp associated with the callee and operands from the @@ -212,7 +206,7 @@ bool IsImmutable(GlobalTensorOp global_tensor, return true; } -static GlobalTensorUsesMap CreateGlobalTensorUsesMap(ModuleOp module) { +GlobalTensorUsesMap CreateGlobalTensorUsesMap(ModuleOp module) { GlobalTensorUsesMap global_tensor_uses; SymbolTable symbol_table(module); @@ -293,13 +287,13 @@ void OptimizeGlobalTensorsPass::runOnOperation() { EraseUnusedGlobalTensors(module, global_tensor_uses); } -} // namespace - // For "opt" to pick up this pass. -static PassRegistration pass( +PassRegistration pass( "tf-saved-model-optimize-global-tensors", "Optimize tf_saved_model.global_tensor's."); +} // namespace + std::unique_ptr> CreateOptimizeGlobalTensorsPass() { return std::make_unique(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index 7158d0f6be0..168b317641d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -117,21 +117,6 @@ std::unique_ptr> CreatePromoteVarHandlesToArgsPass(); std::unique_ptr> CreateConvertReadonlyReferenceVariablesToResourceVariablesPass(); -// Marks function visibility using tf.entry_function specification. That is, -// functions with tf.entry_function attributes are marked with public -// visibility while the other functions are marked with private visibility. -LogicalResult MarkFunctionVisibilityUsingEntryFunctionSpecification( - ModuleOp module); -// Creates a pass that uses tf.entry_function specification to mark function -// visibility. -std::unique_ptr> -CreateMarkFunctionVisibilityUsingEntryFunctionSpecificationPass(); - -// Creates a pass that marks the main function with public visibility, while -// other functions are marked with private visibility. -std::unique_ptr> -CreateMarkOnlyMainFunctionWithPublicVisibilityPass(); - // Creates a simple device assignment pass on TF dialect for CoreRT use case. std::unique_ptr> CreateSimpleTFDeviceAssignmentPass( llvm::StringRef default_device); @@ -162,6 +147,9 @@ std::unique_ptr> CreateLegalizeHloToTfPass(); // generally used beyond exporting to runtimes that supports these ops. In the // future these fusions may be codegen'd automatically. std::unique_ptr> CreateFusedKernelMatcherPass(); + +// Creates function pass to select device index/fold tf.DeviceIndex. +std::unique_ptr> CreateDeviceIndexSelectorPass(); } // namespace TF namespace tf_executor { @@ -296,7 +284,8 @@ std::unique_ptr> CreateTPUHostComputationExpansionPass(); // Creates a pass that extract outside compilation (CPU ops inside TPU cluster) // ops to a separate parallel_execute region to run on CPU. 
-std::unique_ptr> CreateTPUExtractOutsideCompilationPass(); +std::unique_ptr> +CreateTPUExtractOutsideCompilationPass(); // Populates the supplied passmanager with the passes required to run the void CreateTPUBridgePipeline(OpPassManager& pm); @@ -315,13 +304,6 @@ std::unique_ptr> CreateOptimizeGlobalTensorsPass(); // Creates a pass that freezes tf_saved_model.global_tensor ops. std::unique_ptr> CreateFreezeGlobalTensorsPass(); -// Creates a pass that uses tf_saved_model dialect linkage information -// to mark function visibility. That is, exported functions are marked with -// public visibility while the other functions are marked with private -// visibility. -std::unique_ptr> -CreateMarkFunctionVisibilityUsingSavedModelLinkagePass(); - } // namespace tf_saved_model } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc b/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc index cece23b4750..af36770f496 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc @@ -80,11 +80,11 @@ constexpr char kResourceNameArgAttr[] = "tf.resource_name"; // Checks if a function has only one block. mlir::LogicalResult CheckSingleBlockFunction(FuncOp function) { - if (!hasSingleElement(function.getBlocks())) + if (!llvm::hasSingleElement(function)) { return function.emitError() << "expects function '" << function.getName() << "' to have 1 block, got " << function.getBlocks().size(); - + } return success(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc index ed7ebc25c9f..799ab3a0f0d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc @@ -1113,7 +1113,7 @@ LogicalResult ResourceLiftingForFunctionalControlFlow(FuncOp function) { // This routine should only be called when control flow operations are still // represented with TF IfOp and WhileOp operations. In this case, there should // be only one basic blocks in the MLIR representation. 
- if (!hasSingleElement(function.getBlocks())) { + if (!llvm::hasSingleElement(function)) { return function.emitError() << "expect the function to have 1 block while it has " << function.getBlocks().size(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index 7e4baadc397..33ccf5caff2 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -215,6 +215,10 @@ bool InferShapeForNonTFDialectOperation(Operation* op, Dialect* tf_dialect) { return InferShapeForPassThroughOps( tensor_cast.getOperation()->getOperands(), op, tf_dialect); } + if (auto launch_op = dyn_cast(op)) { + return InferShapeForPassThroughOps( + launch_op.GetBody().getTerminator()->getOperands(), op, tf_dialect); + } return false; } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc index c349c2b4c3e..734a7d04a86 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc @@ -343,7 +343,7 @@ LogicalResult HandlePartitionedCallOp( } llvm::SmallDenseMap callee_map; FuncOp lowered_callee = callee; - if (callee.getVisibility() != SymbolTable::Visibility::Private) { + if (!callee.isPrivate()) { // Clone non-private callee in case of signature change. lowered_callee = callee.clone(); lowered_callee.setVisibility(SymbolTable::Visibility::Private); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc index cfeb2b1f031..a9e1243714e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc @@ -759,7 +759,7 @@ LogicalResult HandlePartitionedCallOp( return it->getSecond().accumulate_on_write; }; FuncOp lowered_callee = callee; - if (callee.getVisibility() != SymbolTable::Visibility::Private) { + if (!callee.isPrivate()) { // Clone non-private callee in case of signature change. lowered_callee = callee.clone(); lowered_callee.setVisibility(SymbolTable::Visibility::Private); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc index 9733bfe2290..b118ab6c6c9 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc @@ -322,7 +322,7 @@ LogicalResult HandlePartitionedCallOp( // Rewrite the callee. llvm::SmallDenseMap callee_map; FuncOp lowered_callee = callee; - if (callee.getVisibility() != SymbolTable::Visibility::Private) { + if (!callee.isPrivate()) { // Clone non-private callee in case of signature change. 
lowered_callee = callee.clone(); lowered_callee.setVisibility(SymbolTable::Visibility::Private); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc index 54600faca4b..503c9869557 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc @@ -49,8 +49,9 @@ using OutsideClusterMap = // TODO(b/154363171): Add example tranformations. struct TPUExtractOutsideCompilation - : public PassWrapper { - void runOnFunction() override; + : public PassWrapper> { + void runOnOperation() override; }; // Collects and clusters ops in `block` with the same `_xla_outside_compilation` @@ -108,18 +109,6 @@ tf_device::LaunchOp CreateLaunchOpForOutsideCluster( return launch_op; } -// Propagates the return from `parallel_execute_op` to parent replicate -// op if it exists. -void PropagateParallelExecuteReturnToReplicate( - tf_device::ParallelExecuteOp parallel_execute_op) { - // Update the return for the parallel_execute op parent. - auto replicate = llvm::dyn_cast_or_null( - parallel_execute_op.getParentOp()); - if (replicate) - replicate.GetBody().getTerminator()->setOperands( - parallel_execute_op.execute_outputs()); -} - // Extracts all externally provided operands of `cluster_ops`. llvm::SmallSetVector GetExternalOperands( llvm::ArrayRef cluster_ops) { @@ -305,12 +294,21 @@ void CreateParallelExecuteFromOutsideClusters( tpu_cluster.getOperation()->moveBefore( parallel_execute_tpu_block.getTerminator()); - PropagateParallelExecuteReturnToReplicate(parallel_execute_op); + // Remap cluster results with parallel_execute results if user is outside of + // parallel_execute. + for (auto result : + llvm::zip(tpu_cluster.getResults(), parallel_execute_op.getResults())) { + Value tpu_cluster_result = std::get<0>(result); + Value parallel_execute_result = std::get<1>(result); + for (auto& use : llvm::make_early_inc_range(tpu_cluster_result.getUses())) + if (!parallel_execute_op.getOperation()->isProperAncestor(use.getOwner())) + use.set(parallel_execute_result); + } } -void TPUExtractOutsideCompilation::runOnFunction() { +void TPUExtractOutsideCompilation::runOnOperation() { auto extract_result = - getFunction().walk([&](tf_device::ClusterOp tpu_cluster) { + getOperation().walk([&](tf_device::ClusterOp tpu_cluster) { OutsideClusterMap clusters; if (failed(CollectAndGroupOutsideClusterOps(&tpu_cluster.GetBody(), &clusters))) @@ -328,7 +326,7 @@ void TPUExtractOutsideCompilation::runOnFunction() { } // namespace -std::unique_ptr> +std::unique_ptr> CreateTPUExtractOutsideCompilationPass() { return std::make_unique(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc index 696882cd105..ec9b3df525f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc @@ -146,6 +146,9 @@ LogicalResult EncapsulateFuncAndSerialize(FuncOp entry_func, // We can simply change name of TPU program's main function because there // should be no other reference to it. 
clone.setName("main"); + clone.setVisibility(FuncOp::Visibility::Public); + } else { + clone.setVisibility(FuncOp::Visibility::Private); } symbol_table.insert(clone); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc index f8b6e364f55..b05e87c6485 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc @@ -159,8 +159,7 @@ llvm::SmallVector ExtractFunctionsConnectedToArg( while (!functions_to_parse.empty()) { llvm::SmallVector newly_discovered_functions; for (auto function_info : functions_to_parse) { - Block& func_entry_block = - function_info.func.getBody().getBlocks().front(); + Block& func_entry_block = function_info.func.front(); auto argument = func_entry_block.getArgument(function_info.argument_index); @@ -186,8 +185,7 @@ void IdentifyXlaShardingForComputationInputs( StringRef logical_core_0_sharding, tf_device::ClusterFuncOp cluster_func_op, FuncOp cluster_function, Builder* builder) { // Look up function definition from module. - Block& cluster_function_block = - cluster_function.getBody().getBlocks().front(); + Block& cluster_function_block = cluster_function.front(); ModuleOp module = cluster_func_op.getParentOfType(); llvm::SmallVector sharding_for_args( @@ -215,8 +213,7 @@ void IdentifyXlaShardingForComputationInputs( const int function_argument_index = function_arg_info.argument_index; auto& parsed_function = function_arg_info.func; - Block& parsed_function_block = - parsed_function.getBody().getBlocks().front(); + Block& parsed_function_block = parsed_function.front(); arg_sharding = ParseInputSharding( parsed_function_block.getArgument(function_argument_index)); } @@ -245,7 +242,7 @@ void IdentifyXlaShardingForComputationOutputs( tf_device::ClusterFuncOp cluster_func, Builder* builder) { // By default return values from logical core 0 is used if no sharding // configuration is defined. - Block& function_block = func.getBody().getBlocks().front(); + Block& function_block = func.front(); Operation* terminator = function_block.getTerminator(); llvm::SmallVector sharding_for_rets( terminator->getNumOperands(), logical_core_0_sharding); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc index ec4a25c6fdd..d88982d9ee7 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc @@ -261,7 +261,6 @@ tf_device::ReplicateOp AddInputsToReplicateOp( // placed in logical core 0. // TODO(b/148913020): Remove this constraint once model parallelism is // supported. - assert(devices.size() == 1); assert(devices.find(tensorflow::GetDeviceAliasForLogicalCore(0)) ->getSecond() .size() == num_replicas); @@ -369,9 +368,6 @@ llvm::SmallVector CreateStateVars( // TODO(b/148913020): Remove this constraint once model parallelism is // supported. 
- assert(devices.size() == 1 && - "As model parallelism is not supported yet, tf_device.replicate " - "`devices` attribute should have one dictionary element."); const auto& device_list = devices.find(tensorflow::GetDeviceAliasForLogicalCore(0))->getSecond(); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc index 262f6f4e50c..8cd14894f8f 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc @@ -128,7 +128,7 @@ class LegalizedOpOrValLocNameMapper : public OpOrArgLocNameMapper { Status HasSingleGraphSingleOpIslandsFunctions(mlir::ModuleOp module) { Status status = Status::OK(); module.walk([&](mlir::FuncOp function) { - if (function.getBlocks().size() != 1) { + if (!llvm::hasSingleElement(function)) { status = errors::FailedPrecondition( kInvalidExecutorGraphMsg, "only single block functions are supported."); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 820d0ce31fb..fea809c0798 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -60,6 +60,8 @@ limitations under the License. #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Verifier.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/cc/saved_model/constants.h" +#include "tensorflow/cc/saved_model/loader_util.h" #include "tensorflow/compiler/jit/shape_inference_helpers.h" #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" @@ -116,6 +118,7 @@ using mlir::NamedAttrList; using mlir::TensorType; using mlir::TF::VarHandleOp; using mlir::tf_saved_model::GlobalTensorOp; +using mlir::tf_saved_model::SessionInitializerOp; using stream_executor::port::StatusOr; namespace { @@ -2955,6 +2958,13 @@ void SortSavedModelModule(mlir::ModuleOp module) { named_global_tensor.global_tensor.getOperation()->moveBefore( &module.getBody()->front()); } + + auto initializers = module.getOps(); + if (!initializers.empty()) { + (*initializers.begin()) + .getOperation() + ->moveBefore(&module.getBody()->front()); + } } Status CreateSavedModelIR( @@ -3241,17 +3251,32 @@ class SavedModelSignatureDefImporter { absl::Span exported_names, mlir::MLIRContext* context) : bundle_(bundle), + flib_def_(OpRegistry::Global(), graph_def().library()), + debug_info_(), exported_names_(exported_names), - module_(mlir::ModuleOp::create(mlir::UnknownLoc::get(context))) {} + module_(mlir::ModuleOp::create(mlir::UnknownLoc::get(context))) { + // debug_info might not be loaded with loader_lite. + if (bundle_.debug_info != nullptr) debug_info_ = *bundle_.debug_info; + } // Converts the SavedModel to the SavedModel dialect. Creates an MLIR function // for each signature. StatusOr ConvertSignatures(); - Status ConvertSignature(const GraphDef& graphdef, - const std::string& sig_def_key, - const SignatureDef& signature_def, - const GraphDebugInfo& debug_info, - const FunctionLibraryDefinition& flib_def); + Status ConvertSignature(const std::string& sig_def_key, + const SignatureDef& signature_def); + + // Converts the initialization graph in the SavedModel to an MLIR function. + Status ConvertInitializer(); + + // Converts a graph with feeds and fetches to an MLIR function. 
+ StatusOr ConvertGraph( + const std::string& name, + const std::vector>& inputs, + const std::vector>& outputs, + const std::vector control_outputs); + + // Coarsens the islands in `module_`. + Status CoarsenIslands(); // Creates GlobalTensorOp for each variable and moves each VarHandle op to // the enclosing function's arguments. @@ -3262,6 +3287,10 @@ class SavedModelSignatureDefImporter { // tensor's shape is used to provide the most accurate nested shape. void LiftVariable(VarHandleOp op, GlobalTensorOp global_tensor); + // Removes the variable and related ops in the init function if it is already + // imported as a global tensor. + void RemoveVariable(VarHandleOp op); + using VarGlobalMap = llvm::MapVector< llvm::StringRef, std::pair>>; @@ -3273,18 +3302,68 @@ class SavedModelSignatureDefImporter { GraphImportConfig::InputArrays ParseInputArrays( const std::vector>& inputs); + const GraphDef& graph_def() const { + return bundle_.meta_graph_def.graph_def(); + } + const FunctionLibraryDefinition& flib_def() const { return flib_def_; } + const GraphDebugInfo& debug_info() const { return debug_info_; } + const SavedModelBundle& bundle_; + FunctionLibraryDefinition flib_def_; + GraphDebugInfo debug_info_; absl::Span exported_names_; mlir::OwningModuleRef module_; }; +Status SavedModelSignatureDefImporter::ConvertInitializer() { + std::vector asset_file_defs; + TF_RETURN_IF_ERROR( + internal::GetAssetFileDefs(bundle_.meta_graph_def, &asset_file_defs)); + + if (!asset_file_defs.empty()) + return errors::Unimplemented( + absl::StrCat("Assets are not supported in signaturedef importer")); + + std::string init_node_name; + TF_RETURN_IF_ERROR( + internal::GetInitOp("", bundle_.meta_graph_def, &init_node_name)); + + if (init_node_name.empty()) return Status::OK(); + + TF_ASSIGN_OR_RETURN(auto sub_module, + ConvertGraph(init_node_name, {}, {}, {init_node_name})); + + mlir::SymbolTable symbol_table(*sub_module); + + auto init_func_op = symbol_table.lookup(init_node_name); + + init_func_op.removeAttr("tf.entry_function"); + + mlir::OpBuilder builder(module_->getBodyRegion()); + + // Set the exported name of init function to an reserved name for + // tf_saved_model. + init_func_op.setAttr( + "tf_saved_model.exported_names", + builder.getStrArrayAttr({"__tf_saved_model_session_initializer"})); + + builder.create( + module_->getLoc(), builder.getSymbolRefAttr(init_func_op.getName())); + + // Move the converted functions to top level MLIR module. + auto* block = module_->getBody(); + auto* sub_block = sub_module->getBody(); + block->getOperations().splice( + mlir::Block::iterator(block->getTerminator()), sub_block->getOperations(), + sub_block->begin(), mlir::Block::iterator(sub_block->getTerminator())); + + return Status::OK(); +} + StatusOr SavedModelSignatureDefImporter::ConvertSignatures() { const auto& signatures = bundle_.GetSignatures(); - const auto& graphdef = bundle_.meta_graph_def.graph_def(); - PopulateTfVersions(module_.get(), graphdef.versions()); - - FunctionLibraryDefinition flib_def(OpRegistry::Global(), graphdef.library()); + PopulateTfVersions(module_.get(), graph_def().versions()); // debug_info might not be loaded with loader_lite. 
GraphDebugInfo debug_info; @@ -3307,23 +3386,49 @@ SavedModelSignatureDefImporter::ConvertSignatures() { continue; } - TF_RETURN_IF_ERROR(ConvertSignature(graphdef, sig_def_key, signature_def, - debug_info, flib_def)); + TF_RETURN_IF_ERROR(ConvertSignature(sig_def_key, signature_def)); } - TF_RETURN_IF_ERROR(LiftVariables()); + + TF_RETURN_IF_ERROR(ConvertInitializer()); mlir::OpBuilder builder(module_->getBodyRegion()); module_->setAttr("tf_saved_model.semantics", builder.getUnitAttr()); + + TF_RETURN_IF_ERROR(CoarsenIslands()); + TF_RETURN_IF_ERROR(LiftVariables()); + SortSavedModelModule(*module_); MarkSavedModelFunctionVisibility(*module_); return std::move(module_); } +StatusOr SavedModelSignatureDefImporter::ConvertGraph( + const std::string& name, + const std::vector>& inputs, + const std::vector>& outputs, + const std::vector control_outputs) { + GraphImportConfig specs; + specs.prune_unused_nodes = true; + specs.inputs = ParseInputArrays(inputs); + for (auto& output : outputs) specs.outputs.push_back(output.second.name()); + specs.control_outputs = control_outputs; + + // Convert sub-graphdef to sub-graph. + GraphConstructorOptions options; + options.allow_internal_ops = true; + options.add_default_attributes = true; + Graph graph(OpRegistry::Global()); + + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(options, graph_def(), &graph)); + + // Convert sub-graph to MLIR module.true + return GraphDefImporter::Convert(module_->getContext(), graph, debug_info(), + flib_def(), specs, name); +} + Status SavedModelSignatureDefImporter::ConvertSignature( - const GraphDef& graphdef, const std::string& sig_def_key, - const SignatureDef& signature_def, const GraphDebugInfo& debug_info, - const FunctionLibraryDefinition& flib_def) { + const std::string& sig_def_key, const SignatureDef& signature_def) { // Create local vectors for the input and output and sort them to be // deterministic. We don't want anyone to really depend on the order, client // should lookup argument/result mapping by attribute name. @@ -3339,34 +3444,9 @@ Status SavedModelSignatureDefImporter::ConvertSignature( return lhs.first.size() < rhs.first.size() || lhs.first > rhs.first; }); - GraphImportConfig specs; - specs.prune_unused_nodes = true; - specs.inputs = ParseInputArrays(inputs); - for (auto& output : outputs) specs.outputs.push_back(output.second.name()); - - // Remove unused nodes and create sub-graphdef. - GraphDef sub_graph_def; - TF_RETURN_IF_ERROR(tensorflow::grappler::SetTransitiveFaninGraph( - graphdef, &sub_graph_def, - /*terminal_nodes=*/{specs.outputs.begin(), specs.outputs.end()})); - - // Set the function library definitions in the pruned graphdef. - *sub_graph_def.mutable_library() = flib_def.ToProto(); - - // Convert sub-graphdef to sub-graph. - GraphConstructorOptions options; - options.allow_internal_ops = true; - options.add_default_attributes = true; - Graph sub_graph(OpRegistry::Global()); - - TF_RETURN_IF_ERROR( - ConvertGraphDefToGraph(options, sub_graph_def, &sub_graph)); - // Convert sub-graph to MLIR module. - TF_ASSIGN_OR_RETURN( - auto sub_module, - GraphDefImporter::Convert(module_->getContext(), sub_graph, debug_info, - flib_def, specs, sig_def_key)); + TF_ASSIGN_OR_RETURN(auto sub_module, + ConvertGraph(sig_def_key, inputs, outputs, {})); mlir::OpBuilder builder(sub_module->getBodyRegion()); // Find the FuncOp which corresponds to current SignatureDef. 
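Note on the refactor above: both the per-signature import path and the new initializer import path now funnel through the shared ConvertGraph helper. A minimal sketch of the two call shapes, assuming the member signature introduced in this change; the variable names here are illustrative stand-ins for the values the importer computes:

// Sketch only; mirrors the helper added above.
// Initializer graph: no feeds/fetches, the init node is a control output.
TF_ASSIGN_OR_RETURN(auto init_module,
                    ConvertGraph(init_node_name, /*inputs=*/{}, /*outputs=*/{},
                                 /*control_outputs=*/{init_node_name}));
// Signature graph: feeds/fetches from the SignatureDef, no control outputs.
TF_ASSIGN_OR_RETURN(auto sig_module,
                    ConvertGraph(sig_def_key, inputs, outputs,
                                 /*control_outputs=*/{}));
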
@@ -3399,16 +3479,28 @@ Status SavedModelSignatureDefImporter::ConvertSignature( sub_block->begin(), mlir::Block::iterator(sub_block->getTerminator())); return Status::OK(); -} +} // namespace Status SavedModelSignatureDefImporter::LiftVariables() { VarGlobalMap var_globals; + llvm::SmallVector init_vars; - auto walker = [&var_globals](mlir::Operation* op) { - if (auto var_handle_op = llvm::dyn_cast(op)) - var_globals[var_handle_op.shared_name()].second.push_back(var_handle_op); - else if (op->getName().getStringRef() == "tf.VariableV2") + auto session_initializer = + mlir::tf_saved_model::GetSessionInitializerOp(*module_); + + auto walker = [&var_globals, &init_vars, + &session_initializer](mlir::Operation* op) { + if (auto var_handle_op = llvm::dyn_cast(op)) { + if (session_initializer && + session_initializer.initializer() == + var_handle_op.getParentOfType().getName()) + init_vars.push_back(var_handle_op); + else + var_globals[var_handle_op.shared_name()].second.push_back( + var_handle_op); + } else if (op->getName().getStringRef() == "tf.VariableV2") { return mlir::WalkResult::interrupt(); + } return mlir::WalkResult::advance(); }; bool contains_ref_variable = module_->walk(walker).wasInterrupted(); @@ -3425,9 +3517,51 @@ Status SavedModelSignatureDefImporter::LiftVariables() { for (VarHandleOp var_handle : it.second.second) LiftVariable(var_handle, it.second.first); + for (auto op : init_vars) RemoveVariable(op); + return Status::OK(); } +Status SavedModelSignatureDefImporter::CoarsenIslands() { + mlir::StatusScopedDiagnosticHandler diag_handler(module_->getContext()); + + mlir::PassManager pm(module_->getContext()); + pm.addNestedPass( + mlir::tf_executor::CreateTFExecutorIslandCoarseningPass()); + if (mlir::failed(pm.run(*module_))) + return diag_handler.Combine( + errors::Internal("failed to coarsening islands.")); + + return Status::OK(); +} + +void SavedModelSignatureDefImporter::RemoveVariable(VarHandleOp op) { + llvm::SmallVector work_list; + work_list.push_back(op); + while (!work_list.empty()) { + auto* op = work_list.back(); + work_list.pop_back(); + + for (mlir::Value res : op->getResults()) { + for (mlir::Operation* user : res.getUsers()) { + work_list.push_back(user); + } + } + + for (auto& use : op->getOpOperands()) { + if (mlir::Value value = use.get()) { + mlir::Operation* def = value.getDefiningOp(); + work_list.push_back(def); + } + } + + op->dropAllReferences(); + op->dropAllDefinedValueUses(); + + op->erase(); + } +} + void SavedModelSignatureDefImporter::LiftVariable( VarHandleOp op, GlobalTensorOp global_tensor) { mlir::OpBuilder builder(&module_->getBodyRegion()); @@ -3460,12 +3594,7 @@ void SavedModelSignatureDefImporter::LiftVariable( // Add the newly added function param to entry block's arguments. auto new_value = func_op.front().addArgument(resource_type); - // Remove the VarHandleOp also updating the containing island's return type. 
- DCHECK(llvm::isa(op.getParentOp())); - DCHECK(llvm::cast(op.getParentOp()) - .WrapsSingleOp()); op.getOperation()->replaceAllUsesWith(llvm::ArrayRef(new_value)); - op.getParentOp()->getResult(0).setType(resource_type); op.getOperation()->erase(); } diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_functional_to_executor.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_functional_to_executor.cc index 29f98de6448..78019119d9d 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_functional_to_executor.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_functional_to_executor.cc @@ -46,13 +46,13 @@ struct FunctionalToExecutorDialectConversion } // end anonymous namespace void FunctionalToExecutorDialectConversion::runOnFunction() { - if (getFunction().getBlocks().size() != 1) { + if (!llvm::hasSingleElement(getFunction())) { LLVM_DEBUG(llvm::dbgs() << "Expect single block function, skip conversion " "to tf_executor dialect\n"); return; } auto loc = getFunction().getLoc(); - mlir::Block& body = getFunction().getBody().front(); + mlir::Block& body = getFunction().front(); // Find region of interest and ReturnOp. auto copy_range = body.without_terminator(); if (copy_range.begin() != copy_range.end() && diff --git a/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc b/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc index bd3fe9876ff..5236bdeffbf 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "llvm/ADT/STLExtras.h" #include "llvm/Support/ToolOutputFile.h" #include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project @@ -26,12 +27,12 @@ static mlir::Operation* ExtractOnlyOp(mlir::ModuleOp module) { mlir::FuncOp fn = module.lookupSymbol("main"); if (!fn) return nullptr; - if (fn.getBlocks().size() != 1) return nullptr; + if (!llvm::hasSingleElement(fn)) return nullptr; // Here, modules with exactly two operations in the only basic block are // supported. The last operation should be a terminator operation and the // other operation is the operation of interest. - auto& block = fn.getBlocks().front(); + auto& block = fn.front(); if (block.getOperations().size() != 2) return nullptr; if (!block.back().isKnownTerminator()) return nullptr; diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc index fd1ba3b1901..dac2fea87e2 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc @@ -267,9 +267,6 @@ Status ConvertMLIRToXlaComputation( const XlaCompiler::ShapeRepresentationFn shape_representation_fn, std::vector> custom_legalization_passes) { mlir::PassManager tf2xla(module_op.getContext()); - // Mark main function as public, and other functions as private. 
- tf2xla.addPass( - mlir::TF::CreateMarkOnlyMainFunctionWithPublicVisibilityPass()); tf2xla.addNestedPass(mlir::createCanonicalizerPass()); tf2xla.addPass(mlir::TF::CreateTensorListOpsDecompositionPass()); tf2xla.addPass(mlir::TF::CreateStackOpsDecompositionPass()); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc index 797687ea658..febf2bc096d 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc @@ -41,7 +41,7 @@ std::string MakeUniqueFilename(string name) { static NameCounts& instance = *new NameCounts; // Remove illegal characters from `name`. - for (int i = 0; i < name.size(); ++i) { + for (int i = 0, e = name.size(); i < e; ++i) { char ch = name[i]; if (ch == '/' || ch == '[' || ch == ']' || ch == '*' || ch == '?' || ch == '\\') { diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD index cebfa7cd9d4..80b597d962d 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD @@ -44,8 +44,9 @@ tf_cc_binary( visibility = ["//tensorflow/core/kernels/cubin_headers:__pkg__"], deps = [ ":cubin_creator", - "//tensorflow/core:framework_internal", + "//tensorflow/compiler/mlir:init_mlir", "//tensorflow/core:lib", "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", ], ) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc index b534b5a5604..85a53e042e1 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc @@ -135,11 +135,11 @@ Status LowerTfOpToLhloWithDynamicShapes(mlir::ModuleOp module) { return Status::OK(); } -struct PropagateStaticKnowledge - : public mlir::PassWrapper> { - explicit PropagateStaticKnowledge(mlir::FunctionType type, - llvm::ArrayRef same_shape_) + explicit PropagateTensorFlowABIKnowledge(mlir::FunctionType type, + llvm::ArrayRef same_shape_) : func_type(type), same_shape(same_shape_) {} void runOnOperation() override { @@ -148,6 +148,11 @@ struct PropagateStaticKnowledge // we insert constants into the code and replace usages accordingly. // We do not change the signature so that we keep a somewhat stable ABI // that is easy to undertand by tools. + // We also know that tensorflow aligns all allocated pointers by 16, so + // we pass this on. Furthermore, we know that arguments never alias. More + // precicely, they may only alias (due to reuse) if the kernel does not + // read from a position it previously has written to. We express this with + // the noalias attribute. mlir::LLVM::LLVMFuncOp func = getOperation(); // This only works if the function is local and we can rewrite it. @@ -172,6 +177,9 @@ struct PropagateStaticKnowledge return; } positions.push_back(arg_pos); + // Set alignment and aliasing on the pointers. + func.setArgAttr(arg_pos + 1, "llvm.noalias", b.getBoolAttr(true)); + func.setArgAttr(arg_pos + 1, "llvm.align", b.getIndexAttr(16)); // Replace the offset with zero. Offset is argument number 3. func.getArgument(arg_pos + 2).replaceAllUsesWith(zero); // Forward over base_ptr, aligned_ptr, offset, size and stride arguments. 
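The hunk above tags the aligned-pointer component of each memref argument with LLVM-dialect argument attributes. A minimal standalone sketch of that idiom, assuming func is the mlir::LLVM::LLVMFuncOp being rewritten and arg_pos points at the base pointer of an expanded memref descriptor, as in the pass above:

#include "mlir/Dialect/LLVMIR/LLVMDialect.h"  // mlir::LLVM::LLVMFuncOp
#include "mlir/IR/Builders.h"                 // mlir::OpBuilder

// Sketch: TensorFlow aligns allocated buffers to 16 bytes and kernel
// arguments do not alias, so advertise both facts to LLVM. arg_pos + 1 is
// the aligned pointer within the expanded memref descriptor.
void AnnotateMemrefArg(mlir::LLVM::LLVMFuncOp func, unsigned arg_pos) {
  mlir::OpBuilder b(func.getContext());
  func.setArgAttr(arg_pos + 1, "llvm.noalias", b.getBoolAttr(true));
  func.setArgAttr(arg_pos + 1, "llvm.align", b.getIndexAttr(16));
}
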
@@ -213,7 +221,7 @@ struct PropagateStaticKnowledge llvm::ArrayRef same_shape; }; -Status PropagateStaticShapeKnowledgeToKernel( +Status PropagateTensorFlowABIKnowledgeToKernel( mlir::ModuleOp module, llvm::ArrayRef same_shape) { // Grab the original signature from the single function. auto func = *module.getBody()->op_begin(); @@ -228,7 +236,8 @@ Status PropagateStaticShapeKnowledgeToKernel( /*printAfterOnlyOnChange=*/false, llvm::dbgs()); auto& kernel_pm = pm.nest<::mlir::gpu::GPUModuleOp>(); kernel_pm.addNestedPass( - absl::make_unique(func.getType(), same_shape)); + absl::make_unique(func.getType(), + same_shape)); if (failed(pm.run(module))) { return InternalError("Static knowledge propagation failed."); @@ -259,11 +268,12 @@ StatusOr> tensorflow::kernel_gen::GenerateCubinForTfCode( options.tile_sizes = tile_sizes; options.unroll_factors = unroll_factors; options.collapse_parallel_loops = false; + options.use_approximations = true; TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerLHLOToGPU(module.get(), options)); } TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerKernelBodiesToNVVM(module.get())); TF_RETURN_IF_ERROR( - PropagateStaticShapeKnowledgeToKernel(module.get(), same_shape)); + PropagateTensorFlowABIKnowledgeToKernel(module.get(), same_shape)); mlir::OwningModuleRef kernel_module = xla::mlir_gpu::ExtractKernelModule(*module).ValueOrDie(); @@ -278,10 +288,15 @@ StatusOr> tensorflow::kernel_gen::GenerateCubinForTfCode( xla::HloModuleConfig config; config.set_debug_options(xla::GetDebugOptionsFromFlags()); + auto enable_fusion = [](llvm::TargetMachine* target) { + target->Options.AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast; + }; + TF_ASSIGN_OR_RETURN(std::string libdevice_dir, GetLibdeviceDir(config)); - TF_ASSIGN_OR_RETURN(std::string ptx, xla::gpu::nvptx::CompileToPtx( - llvmModule.get(), compute_capability, - config, libdevice_dir)); + TF_ASSIGN_OR_RETURN( + std::string ptx, + xla::gpu::nvptx::CompileToPtx(llvmModule.get(), compute_capability, + config, libdevice_dir, enable_fusion)); VLOG(1) << ptx; #if GOOGLE_CUDA diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc index 66fcabde0ac..96831689600 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc @@ -21,77 +21,37 @@ #include #include -#include "absl/strings/numbers.h" -#include "absl/strings/str_split.h" #include "absl/strings/string_view.h" +#include "llvm/Support/CommandLine.h" +#include "tensorflow/compiler/mlir/init_mlir.h" #include "tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h" #include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/util/command_line_flags.h" - -namespace { -bool ParseStringList(std::string string_list, std::vector* result) { - result->clear(); - uint32_t item; - auto items = absl::StrSplit(string_list, ','); - for (const auto& item_str : items) { - if (!absl::SimpleAtoi(item_str, &item)) { - LOG(ERROR) << "Expected token " << item_str << " to be an integer"; - return false; - } - result->push_back(item); - } - return true; -} -} // namespace int main(int argc, char** argv) { - std::string input_file = "foo.mlir"; - std::string output_file = "foo.bin"; - int32_t architecture = 50; - std::vector tile_sizes; - std::vector unroll_factors; - std::vector same_shape; + llvm::cl::opt input_file("input", llvm::cl::desc("input file"), + 
llvm::cl::value_desc("filename"), + llvm::cl::init("foo.mlir")); + llvm::cl::opt output_file( + "output", llvm::cl::desc("output file"), llvm::cl::value_desc("filename"), + llvm::cl::init("foo.bin")); + llvm::cl::opt architecture( + "arch", llvm::cl::desc("target architecture (e.g. 50 for sm_50)"), + llvm::cl::init(50)); + llvm::cl::list tile_sizes( + "tile_sizes", llvm::cl::desc("tile sizes to use"), llvm::cl::ZeroOrMore, + llvm::cl::CommaSeparated); + llvm::cl::list unroll_factors( + "unroll_factors", + llvm::cl::desc("factors to unroll by, separated by commas"), + llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated); + llvm::cl::list same_shape( + "same_shape", + llvm::cl::desc("arguments with same shape, separated by commas"), + llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated); - auto parse_tile_sizes = [&tile_sizes](std::string tile_sizes_str) { - if (!ParseStringList(tile_sizes_str, &tile_sizes)) { - return false; - } - // Initialize with the default. - if (tile_sizes.empty()) { - tile_sizes.push_back(16); - tile_sizes.push_back(64); - } - return true; - }; - - auto parse_unroll_factors = - [&unroll_factors](std::string unroll_factors_str) { - return ParseStringList(unroll_factors_str, &unroll_factors); - }; - - auto parse_same_shape = [&same_shape](std::string same_shape_str) { - return ParseStringList(same_shape_str, &same_shape); - }; - - std::vector flag_list = { - tensorflow::Flag("input", &input_file, "input file"), - tensorflow::Flag("output", &output_file, "output file"), - tensorflow::Flag("arch", &architecture, - "target architecture (e.g. 50 for sm_50)"), - tensorflow::Flag("tile_sizes", parse_tile_sizes, "16,64", - "tile sizes to use"), - tensorflow::Flag("unroll_factors", parse_unroll_factors, "", - "factors to unroll by, separated by commas"), - tensorflow::Flag("same_shape", parse_same_shape, "", - "arguments with same shape, separated by commas"), - }; - bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list); - tensorflow::port::InitMain("usage", &argc, &argv); - if (!parse_ok) { - return 1; - } + tensorflow::InitMlir y(&argc, &argv); + llvm::cl::ParseCommandLineOptions(argc, argv, "TF op GPU kernel generator\n"); std::pair compute_capability(architecture / 10, architecture % 10); diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index 43458aab2d3..d089f80d571 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -515,6 +515,24 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "xla_legalize_tanh_to_approximation", + srcs = ["transforms/legalize_tanh_to_approximation.cc"], + hdrs = [ + "transforms/passes.h", + "transforms/rewriters.h", + ], + deps = [ + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], + alwayslink = 1, +) + gentbl( name = "xla_lower_complex_inc_gen", tbl_outs = [ @@ -946,6 +964,7 @@ cc_library( ":xla_hlo_fusion", ":xla_hlo_to_lhlo_with_xla", ":xla_legalize_control_flow", + ":xla_legalize_tanh_to_approximation", ":xla_legalize_tf", ":xla_legalize_tf_with_tf2xla", ":xla_legalize_to_linalg", diff --git a/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc index 26db4549a2a..3408f3ed0cc 100644 --- a/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc @@ -49,7 +49,7 @@ static Type GetBroadcastType(Type x, Type y, Type element_type, if (shape_x.size() 
== shape_y.size()) { llvm::SmallVector out_shape(shape_x.size()); - for (int i = 0; i < shape_x.size(); i++) { + for (int i = 0, e = shape_x.size(); i < e; i++) { auto x_val = shape_x[i]; auto y_val = shape_y[i]; if (x_val == -1 || y_val == -1) { diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc index 38bff6c2ca7..e0fa1da93b8 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc @@ -106,53 +106,6 @@ DenseIntElementsAttr BuildSliceLimits(DenseIntElementsAttr start_indices, return GetI64ElementsAttr(slice_limits, builder); } -// Returns the padding value of the given position. If padding_attr is a -// nullptr, returns 0. -static int64_t GetPaddingValue(DenseIntElementsAttr padding_attr, - ArrayRef index) { - if (!padding_attr) return 0; - return padding_attr.getValue(index); -} - -static bool IsOnlyPaddingSpatialDims(Value lhs, - ConvDimensionNumbers dimension_numbers, - DenseIntElementsAttr edge_padding_low, - DenseIntElementsAttr edge_padding_high) { - const int64_t batch_dim = dimension_numbers.input_batch_dimension().getInt(); - const int64_t feature_dim = - dimension_numbers.input_feature_dimension().getInt(); - if (edge_padding_low.getValue(batch_dim) || - edge_padding_high.getValue(batch_dim)) - return false; - if (edge_padding_low.getValue(feature_dim) || - edge_padding_high.getValue(feature_dim)) - return false; - return true; -} - -DenseIntElementsAttr BuildConvPaddingAttrs( - DenseIntElementsAttr edge_padding_low, - DenseIntElementsAttr edge_padding_high, DenseIntElementsAttr padding_attr, - ConvDimensionNumbers dimension_numbers, Builder* builder) { - SmallVector padding_low, padding_high; - for (const auto& dim : dimension_numbers.input_spatial_dimensions()) { - unsigned i = dim.getZExtValue(); - padding_low.push_back(edge_padding_low.getValue(i)); - padding_high.push_back(edge_padding_high.getValue(i)); - } - - int rank = padding_low.size(); - SmallVector padding; - for (unsigned i = 0; i < rank; ++i) { - padding.push_back(GetPaddingValue(padding_attr, {i, 0}) + padding_low[i]); - padding.push_back(GetPaddingValue(padding_attr, {i, 1}) + padding_high[i]); - } - // padding_attr.getType() doesn't work because it is an optional attribute, - // which can be a nullptr. - auto type = RankedTensorType::get({rank, 2}, builder->getIntegerType(64)); - return DenseIntElementsAttr::get(type, padding); -} - #include "tensorflow/compiler/mlir/xla/transforms/generated_canonicalize.inc" } // namespace @@ -891,7 +844,7 @@ static Attribute foldConcatenateHelper(ConcatenateOp* op, auto shape = type.getShape(); size_t top_size = 1; - for (int i = 0; i < axis; i++) { + for (int i = 0, e = axis; i < e; i++) { top_size = top_size * shape[i]; } @@ -1169,7 +1122,7 @@ static LogicalResult Verify(MapOp op) { // increasing. 
auto values = op.dimensions().getValues(); auto dimensions = std::vector{values.begin(), values.end()}; - for (int i = 0; i < dimensions.size(); ++i) { + for (int i = 0, e = dimensions.size(); i < e; ++i) { if (dimensions[i] != i) return op.emitOpError() << "requires monotonically increasing dimension " "numbers, but got: " @@ -2153,14 +2106,5 @@ LogicalResult deriveShapeFromFirstOperand( return success(); } -//===----------------------------------------------------------------------===// -// ConvOp -//===----------------------------------------------------------------------===// - -void ConvOp::getCanonicalizationPatterns(OwningRewritePatternList& results, - MLIRContext* context) { - results.insert(context); -} - } // namespace xla_hlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index b1745c73fbf..f92d1c5b85c 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -929,8 +929,6 @@ def HLO_ConvOp : HLO_Op<"convolution", [NoSideEffect]>, BASE_HLO_ConvOp { ); let results = (outs HLO_Tensor); - - let hasCanonicalizer = 1; } def HLO_CopyOp: HLO_Op<"copy", [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_CopyOp { diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td index aed7c83570e..95ad97118ef 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td @@ -760,15 +760,6 @@ def LHLO_SortOp: LHLO_Op<"sort", []>, BASE_HLO_SortOp { let regions = (region SizedRegion<1>:$comparator); } -def LHLO_TupleSelectOp: LHLO_Op<"tuple_select", [SameOperandsShape]> { - let arguments = (ins - Arg:$pred, - Arg:$on_true, - Arg:$on_false, - Arg:$output - ); -} - //===----------------------------------------------------------------------===// // Late operations //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc index 21b1ac5f0ea..3c11d8e590d 100644 --- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc @@ -132,6 +132,22 @@ StatusOr MlirHloBuilder::FftInternal( return MakeXlaOp(op); } +StatusOr MlirHloBuilder::CustomCallInternal( + const string& call_target_name, absl::Span operands, + const Shape& shape, const string& opaque, + absl::optional> operand_shapes_with_layout) { + if (operand_shapes_with_layout.has_value()) + return Unimplemented( + "CustomCall doesn't support operands shapes with layout"); + TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType( + shape, builder_)); + auto op = builder_.create( + loc_, ty, GetValues(operands), builder_.getStringAttr(call_target_name), + /*has_side_effect=*/builder_.getBoolAttr(false), + builder_.getStringAttr(opaque)); + return MakeXlaOp(op); +} + StatusOr MlirHloBuilder::ReduceInternal( const Shape& shape, absl::Span all_operands, const XlaComputation& computation, diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h index 4b28c32db99..4d7d93af7a7 100644 --- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h @@ -124,6 +124,12 @@ class MlirHloBuilder : public XlaBuilder { FftType fft_type, absl::Span fft_length) override; + StatusOr CustomCallInternal(const string& call_target_name, + absl::Span operands, + const 
Shape& shape, const string& opaque, + absl::optional> + operand_shapes_with_layout) override; + StatusOr ReduceInternal( const Shape& shape, absl::Span all_operands, const XlaComputation& computation, diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc index 60d9a698731..7a576780c61 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc @@ -1148,13 +1148,13 @@ LogicalResult ConvertToHloModule::LowerFunctionCall( LogicalResult ConvertToHloModule::RunOnFunction(mlir::FuncOp f) { if (lowered_computation_.count(f)) return success(); - if (f.getBlocks().size() != 1) { + if (!llvm::hasSingleElement(f)) { return f.emitError("only single block Function supported"); } // Create a sub-builder if this is not the main function. std::unique_ptr builder_up; - bool entry_function = f.getName().str() == "main"; + bool entry_function = f.getName() == "main"; if (!entry_function) builder_up = module_builder_.CreateSubBuilder(f.getName().str()); auto& builder = entry_function ? module_builder_ : *builder_up; diff --git a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir index ef0f8c4d200..1954c3344df 100644 --- a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir @@ -415,71 +415,6 @@ func @fold_copy(%arg : tensor<1x4xf32>) -> tensor<1x4xf32> { return %0 : tensor<1x4xf32> } -// CHECK-LABEL: func @fold_pad_into_conv_f32 -func @fold_pad_into_conv_f32(%arg0 : tensor<1x32x32x3xf32>, - %arg1 : tensor<7x7x3x64xf32>) - -> tensor<1x16x16x64xf32> { - // CHECK-NOT: xla_hlo.pad - // CHECK: xla_hlo.convolution - // CHECK-SAME: padding = dense<3> : tensor<2x2xi64> - %0 = xla_hlo.constant dense<0.000000e+00> : tensor - %1 = "xla_hlo.pad"(%arg0, %0) { - edge_padding_high = dense<[0, 3, 3, 0]> : tensor<4xi64>, - edge_padding_low = dense<[0, 3, 3, 0]> : tensor<4xi64>, - interior_padding = dense<0> : tensor<4xi64> - } : (tensor<1x32x32x3xf32>, tensor) -> tensor<1x38x38x3xf32> - %2 = "xla_hlo.convolution"(%1, %arg1) { - batch_group_count = 1 : i64, - dimension_numbers = { - input_batch_dimension = 0 : i64, - input_feature_dimension = 3 : i64, - input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, - kernel_input_feature_dimension = 2 : i64, - kernel_output_feature_dimension = 3 : i64, - kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, - output_batch_dimension = 0 : i64, - output_feature_dimension = 3 : i64, - output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64> - }, - feature_group_count = 1 : i64, - padding = dense<0> : tensor<2x2xi64>, - window_strides = dense<2> : tensor<2xi64> - } : (tensor<1x38x38x3xf32>, tensor<7x7x3x64xf32>) -> tensor<1x16x16x64xf32> - return %2 : tensor<1x16x16x64xf32> -} - -// CHECK-LABEL: func @fold_pad_into_conv_i32 -func @fold_pad_into_conv_i32(%arg0 : tensor<1x32x32x3xi32>, - %arg1 : tensor<7x7x3x64xi32>) - -> tensor<1x16x16x64xi32> { - // CHECK-NOT: xla_hlo.pad - // CHECK: xla_hlo.convolution - // CHECK-SAME: padding = dense<3> : tensor<2x2xi64> - %0 = xla_hlo.constant dense<0> : tensor - %1 = "xla_hlo.pad"(%arg0, %0) { - edge_padding_high = dense<[0, 3, 3, 0]> : tensor<4xi64>, - edge_padding_low = dense<[0, 3, 3, 0]> : tensor<4xi64>, - interior_padding = dense<0> : tensor<4xi64> - } : (tensor<1x32x32x3xi32>, tensor) -> tensor<1x38x38x3xi32> - %2 = "xla_hlo.convolution"(%1, %arg1) { - batch_group_count = 1 : i64, - dimension_numbers = { - 
input_batch_dimension = 0 : i64, - input_feature_dimension = 3 : i64, - input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, - kernel_input_feature_dimension = 2 : i64, - kernel_output_feature_dimension = 3 : i64, - kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, - output_batch_dimension = 0 : i64, - output_feature_dimension = 3 : i64, - output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64> - }, - feature_group_count = 1 : i64, - window_strides = dense<2> : tensor<2xi64> - } : (tensor<1x38x38x3xi32>, tensor<7x7x3x64xi32>) -> tensor<1x16x16x64xi32> - return %2 : tensor<1x16x16x64xi32> -} - // CHECK-LABEL: func @dynamic_reshape_not_actually_dynamic func @dynamic_reshape_not_actually_dynamic(%arg0: tensor<4xf32>, %shape: tensor<2xindex>) -> tensor<4x1xf32> { // CHECK: xla_hlo.reshape diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir index b8a6df54519..86a7f2b9e09 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir @@ -35,7 +35,7 @@ func @not_whitelisted_op(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor // CHECK-LABEL: unranked_operand func @unranked_operand(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK: tf.Abs - // expected-remark@+1 {{lowering requires static shaped operands}} + // expected-remark@+1 {{lowering requires static shaped tensor operands}} %0 = "tf.Abs"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> @@ -44,12 +44,20 @@ func @unranked_operand(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK-LABEL: dynamic_operand func @dynamic_operand(%arg0: tensor) -> tensor { // CHECK: tf.Abs - // expected-remark@+1 {{lowering requires static shaped operands}} + // expected-remark@+1 {{lowering requires static shaped tensor operands}} %0 = "tf.Abs"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: tuple_type +func @tuple_type(%arg0: tuple, tensor>) -> tensor { + // Verifies that the pass can handle operands of non-tensor type like tuple + // from non TensorFlow ops. 
+ %0 = "xla_hlo.get_tuple_element"(%arg0) {index = 0 : i32} : (tuple, tensor>) -> tensor + return %0 : tensor +} + // CHECK-LABEL: unsupported_dtype func @unsupported_dtype(%arg0: tensor<2x!tf.variant>) -> tensor<2x!tf.variant> { // CHECK: tf.AddN diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 8d6969dd669..2cd98ea3f6b 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -814,6 +814,13 @@ func @preventgradient(%arg0: tensor<1xi32>) -> tensor<1xi32> { return %0: tensor<1xi32> } +// CHECK-LABEL: func @checkNumerics +func @checkNumerics(%arg0: tensor<1xf32>) -> tensor<1xf32> { + // CHECK-NEXT: return %arg0 : tensor<1xf32> + %0 = "tf.CheckNumerics"(%arg0) {message = "check numerics"} : (tensor<1xf32>) -> tensor<1xf32> + return %0: tensor<1xf32> +} + //===----------------------------------------------------------------------===// // InfeedDequeueTuple legalization //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/xla/tests/legalize_tanh_to_approximation.mlir b/tensorflow/compiler/mlir/xla/tests/legalize_tanh_to_approximation.mlir new file mode 100644 index 00000000000..a8286c9b5a9 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/legalize_tanh_to_approximation.mlir @@ -0,0 +1,134 @@ +// RUN: xla-opt -xla-legalize-tanh-to-approximation -split-input-file %s | FileCheck %s + +func @tanh_f64(%arg0 : f64) -> f64 { + %res = tanh %arg0 : f64 + return %res : f64 +} + +// CHECK-LABEL: @tanh_f64 +// CHECK: tanh + +// ----- + +func @tanh_f32(%arg0 : f32) -> f32 { + %res = tanh %arg0 : f32 + return %res : f32 +} + +// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py +// CHECK: module { + +// CHECK-LABEL: func @tanh_f32( +// CHECK-SAME: %[[VAL_0:.*]]: f32) -> f32 { +// CHECK: %[[VAL_1:.*]] = constant 2.000000e+01 : f32 +// CHECK: %[[VAL_2:.*]] = constant 1.000000e+00 : f32 +// CHECK: %[[VAL_3:.*]] = constant 4.000000e-04 : f32 +// CHECK: %[[VAL_4:.*]] = constant 9.000000e+00 : f32 +// CHECK: %[[VAL_5:.*]] = constant -2.76076837E-16 : f32 +// CHECK: %[[VAL_6:.*]] = constant 2.00018794E-13 : f32 +// CHECK: %[[VAL_7:.*]] = constant -8.60467184E-11 : f32 +// CHECK: %[[VAL_8:.*]] = constant 5.12229725E-8 : f32 +// CHECK: %[[VAL_9:.*]] = constant 1.48572235E-5 : f32 +// CHECK: %[[VAL_10:.*]] = constant 6.37261954E-4 : f32 +// CHECK: %[[VAL_11:.*]] = constant 0.00489352457 : f32 +// CHECK: %[[VAL_12:.*]] = constant 1.19825836E-6 : f32 +// CHECK: %[[VAL_13:.*]] = constant 1.18534706E-4 : f32 +// CHECK: %[[VAL_14:.*]] = constant 0.00226843474 : f32 +// CHECK: %[[VAL_15:.*]] = constant 0.00489352504 : f32 +// CHECK: %[[VAL_16:.*]] = absf %[[VAL_0]] : f32 +// CHECK: %[[VAL_17:.*]] = copysign %[[VAL_2]], %[[VAL_0]] : f32 +// CHECK: %[[VAL_18:.*]] = cmpf "ult", %[[VAL_16]], %[[VAL_1]] : f32 +// CHECK: %[[VAL_19:.*]] = cmpf "olt", %[[VAL_16]], %[[VAL_3]] : f32 +// CHECK: %[[VAL_20:.*]] = cmpf "ule", %[[VAL_16]], %[[VAL_4]] : f32 +// CHECK: %[[VAL_21:.*]] = copysign %[[VAL_4]], %[[VAL_0]] : f32 +// CHECK: %[[VAL_22:.*]] = select %[[VAL_20]], %[[VAL_0]], %[[VAL_21]] : f32 +// CHECK: %[[VAL_23:.*]] = mulf %[[VAL_22]], %[[VAL_22]] : f32 +// CHECK: %[[VAL_24:.*]] = mulf %[[VAL_23]], %[[VAL_5]] : f32 +// CHECK: %[[VAL_25:.*]] = addf %[[VAL_24]], %[[VAL_6]] : f32 +// CHECK: %[[VAL_26:.*]] = mulf %[[VAL_23]], %[[VAL_25]] : f32 +// CHECK: %[[VAL_27:.*]] = addf %[[VAL_26]], %[[VAL_7]] : 
f32 +// CHECK: %[[VAL_28:.*]] = mulf %[[VAL_23]], %[[VAL_27]] : f32 +// CHECK: %[[VAL_29:.*]] = addf %[[VAL_28]], %[[VAL_8]] : f32 +// CHECK: %[[VAL_30:.*]] = mulf %[[VAL_23]], %[[VAL_29]] : f32 +// CHECK: %[[VAL_31:.*]] = addf %[[VAL_30]], %[[VAL_9]] : f32 +// CHECK: %[[VAL_32:.*]] = mulf %[[VAL_23]], %[[VAL_31]] : f32 +// CHECK: %[[VAL_33:.*]] = addf %[[VAL_32]], %[[VAL_10]] : f32 +// CHECK: %[[VAL_34:.*]] = mulf %[[VAL_23]], %[[VAL_33]] : f32 +// CHECK: %[[VAL_35:.*]] = addf %[[VAL_34]], %[[VAL_11]] : f32 +// CHECK: %[[VAL_36:.*]] = mulf %[[VAL_22]], %[[VAL_35]] : f32 +// CHECK: %[[VAL_37:.*]] = mulf %[[VAL_23]], %[[VAL_12]] : f32 +// CHECK: %[[VAL_38:.*]] = addf %[[VAL_37]], %[[VAL_13]] : f32 +// CHECK: %[[VAL_39:.*]] = mulf %[[VAL_23]], %[[VAL_38]] : f32 +// CHECK: %[[VAL_40:.*]] = addf %[[VAL_39]], %[[VAL_14]] : f32 +// CHECK: %[[VAL_41:.*]] = mulf %[[VAL_23]], %[[VAL_40]] : f32 +// CHECK: %[[VAL_42:.*]] = addf %[[VAL_41]], %[[VAL_15]] : f32 +// CHECK: %[[VAL_43:.*]] = divf %[[VAL_36]], %[[VAL_42]] : f32 +// CHECK: %[[VAL_44:.*]] = select %[[VAL_19]], %[[VAL_0]], %[[VAL_43]] : f32 +// CHECK: %[[VAL_45:.*]] = select %[[VAL_18]], %[[VAL_44]], %[[VAL_17]] : f32 +// CHECK: return %[[VAL_45]] : f32 +// CHECK: } +// CHECK: } + +// ----- + +func @tanh_f16(%arg0 : f16) -> f16 { + %res = tanh %arg0 : f16 + return %res : f16 +} + +// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py +// CHECK: module { + +// CHECK-LABEL: func @tanh_f16( +// CHECK-SAME: %[[VAL_0:.*]]: f16) -> f16 { +// CHECK: %[[VAL_1:.*]] = constant 2.000000e+01 : f32 +// CHECK: %[[VAL_2:.*]] = constant 1.000000e+00 : f32 +// CHECK: %[[VAL_3:.*]] = constant 4.000000e-04 : f32 +// CHECK: %[[VAL_4:.*]] = constant 9.000000e+00 : f32 +// CHECK: %[[VAL_5:.*]] = constant -2.76076837E-16 : f32 +// CHECK: %[[VAL_6:.*]] = constant 2.00018794E-13 : f32 +// CHECK: %[[VAL_7:.*]] = constant -8.60467184E-11 : f32 +// CHECK: %[[VAL_8:.*]] = constant 5.12229725E-8 : f32 +// CHECK: %[[VAL_9:.*]] = constant 1.48572235E-5 : f32 +// CHECK: %[[VAL_10:.*]] = constant 6.37261954E-4 : f32 +// CHECK: %[[VAL_11:.*]] = constant 0.00489352457 : f32 +// CHECK: %[[VAL_12:.*]] = constant 1.19825836E-6 : f32 +// CHECK: %[[VAL_13:.*]] = constant 1.18534706E-4 : f32 +// CHECK: %[[VAL_14:.*]] = constant 0.00226843474 : f32 +// CHECK: %[[VAL_15:.*]] = constant 0.00489352504 : f32 +// CHECK: %[[VAL_16:.*]] = fpext %[[VAL_0]] : f16 to f32 +// CHECK: %[[VAL_17:.*]] = absf %[[VAL_16]] : f32 +// CHECK: %[[VAL_18:.*]] = copysign %[[VAL_2]], %[[VAL_16]] : f32 +// CHECK: %[[VAL_19:.*]] = cmpf "ult", %[[VAL_17]], %[[VAL_1]] : f32 +// CHECK: %[[VAL_20:.*]] = cmpf "olt", %[[VAL_17]], %[[VAL_3]] : f32 +// CHECK: %[[VAL_21:.*]] = cmpf "ule", %[[VAL_17]], %[[VAL_4]] : f32 +// CHECK: %[[VAL_22:.*]] = copysign %[[VAL_4]], %[[VAL_16]] : f32 +// CHECK: %[[VAL_23:.*]] = select %[[VAL_21]], %[[VAL_16]], %[[VAL_22]] : f32 +// CHECK: %[[VAL_24:.*]] = mulf %[[VAL_23]], %[[VAL_23]] : f32 +// CHECK: %[[VAL_25:.*]] = mulf %[[VAL_24]], %[[VAL_5]] : f32 +// CHECK: %[[VAL_26:.*]] = addf %[[VAL_25]], %[[VAL_6]] : f32 +// CHECK: %[[VAL_27:.*]] = mulf %[[VAL_24]], %[[VAL_26]] : f32 +// CHECK: %[[VAL_28:.*]] = addf %[[VAL_27]], %[[VAL_7]] : f32 +// CHECK: %[[VAL_29:.*]] = mulf %[[VAL_24]], %[[VAL_28]] : f32 +// CHECK: %[[VAL_30:.*]] = addf %[[VAL_29]], %[[VAL_8]] : f32 +// CHECK: %[[VAL_31:.*]] = mulf %[[VAL_24]], %[[VAL_30]] : f32 +// CHECK: %[[VAL_32:.*]] = addf %[[VAL_31]], %[[VAL_9]] : f32 +// CHECK: %[[VAL_33:.*]] = mulf %[[VAL_24]], %[[VAL_32]] : f32 +// CHECK: 
%[[VAL_34:.*]] = addf %[[VAL_33]], %[[VAL_10]] : f32 +// CHECK: %[[VAL_35:.*]] = mulf %[[VAL_24]], %[[VAL_34]] : f32 +// CHECK: %[[VAL_36:.*]] = addf %[[VAL_35]], %[[VAL_11]] : f32 +// CHECK: %[[VAL_37:.*]] = mulf %[[VAL_23]], %[[VAL_36]] : f32 +// CHECK: %[[VAL_38:.*]] = mulf %[[VAL_24]], %[[VAL_12]] : f32 +// CHECK: %[[VAL_39:.*]] = addf %[[VAL_38]], %[[VAL_13]] : f32 +// CHECK: %[[VAL_40:.*]] = mulf %[[VAL_24]], %[[VAL_39]] : f32 +// CHECK: %[[VAL_41:.*]] = addf %[[VAL_40]], %[[VAL_14]] : f32 +// CHECK: %[[VAL_42:.*]] = mulf %[[VAL_24]], %[[VAL_41]] : f32 +// CHECK: %[[VAL_43:.*]] = addf %[[VAL_42]], %[[VAL_15]] : f32 +// CHECK: %[[VAL_44:.*]] = divf %[[VAL_37]], %[[VAL_43]] : f32 +// CHECK: %[[VAL_45:.*]] = select %[[VAL_20]], %[[VAL_16]], %[[VAL_44]] : f32 +// CHECK: %[[VAL_46:.*]] = select %[[VAL_19]], %[[VAL_45]], %[[VAL_18]] : f32 +// CHECK: %[[VAL_47:.*]] = fptrunc %[[VAL_46]] : f32 to f16 +// CHECK: return %[[VAL_47]] : f16 +// CHECK: } +// CHECK: } + diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir index 0ed8b36466e..1e803da4ac6 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir @@ -964,23 +964,3 @@ func @sort_memrefs(%arg0: memref<16x16xf32>, %arg1: memref<16x16xf16>, }) : (memref<16x16xf32>, memref<16x16xf16>, tuple, memref<16x16xf16>>) -> () return } - -// ----- - -// CHECK-LABEL: func @tuple_select_memrefs -func @tuple_select_memrefs(%pred: memref<20xi1>, %true_values: memref<20xf32>, - %false_values: memref<20xf32>, %arg_out: memref<20xf32>) -> () { - "xla_lhlo.tuple_select"(%pred, %true_values, %false_values, %arg_out) - : (memref<20xi1>, memref<20xf32>, memref<20xf32>, memref<20xf32>) -> () - return -} - -// ----- - -func @tuple_select_memrefs(%pred: memref<10xi1>, %true_values: memref<20xf32>, - %false_values: memref<20xf32>, %arg_out: memref<20xf32>) -> () { - // expected-error@+1{{requires the same shape for all operands}} - "xla_lhlo.tuple_select"(%pred, %true_values, %false_values, %arg_out) - : (memref<10xi1>, memref<20xf32>, memref<20xf32>, memref<20xf32>) -> () - return -} diff --git a/tensorflow/compiler/mlir/xla/transforms/canonicalize.td b/tensorflow/compiler/mlir/xla/transforms/canonicalize.td index b788cb80380..c319551d92a 100644 --- a/tensorflow/compiler/mlir/xla/transforms/canonicalize.td +++ b/tensorflow/compiler/mlir/xla/transforms/canonicalize.td @@ -28,54 +28,3 @@ def UnaryEinsumToEinsum : Pat< (HLO_UnaryEinsumOp $operand, $equation), (HLO_EinsumOp (HLO_ConstOp (GetScalarOfType<1> $operand)), $operand, (UnaryToBinaryEinsumEq $equation))>; - -//===----------------------------------------------------------------------===// -// Conv op patterns. 
-//===----------------------------------------------------------------------===// - -def IsZero : Attr() &&" - "$_self.cast().isSplat() &&" - "$_self.cast().getSplatValue()" - ".getValue().isZero()) ||" - "($_self.isa() &&" - "$_self.cast().isSplat() &&" - "$_self.cast().getSplatValue()" - ".getInt() == 0)">>; - -def IsOnlyPaddingSpatialDims - : Constraint>; - -def BuildConvPaddingAttrs : NativeCodeCall< - "BuildConvPaddingAttrs($0, $1, $2, $3, &$_builder)">; - -def FoldPadIntoConv : Pat< - (HLO_ConvOp - (HLO_PadOp $lhs, - (HLO_ConstOp IsZero:$padding_value), - $edge_padding_low, - $edge_padding_high, - IsZero:$interior_padding), - $rhs, - $window_strides, - $padding, - $lhs_dilation, - $rhs_dilation, - $dimension_numbers, - $feature_group_count, - $batch_group_count, - $precision_config), - (HLO_ConvOp - $lhs, - $rhs, - $window_strides, - (BuildConvPaddingAttrs $edge_padding_low, $edge_padding_high, $padding, - $dimension_numbers), - $lhs_dilation, - $rhs_dilation, - $dimension_numbers, - $feature_group_count, - $batch_group_count, - $precision_config), - [(IsOnlyPaddingSpatialDims $lhs, $dimension_numbers, $edge_padding_low, - $edge_padding_high)]>; diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc index 1cfe0c12e20..7cdc0d92207 100644 --- a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc +++ b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc @@ -230,10 +230,10 @@ struct HloToLhloReduceOpConverter : public BaseOpConversion { auto loc = op.getLoc(); // TODO(b/137624192) Implement variadic reduce. if (op.getNumResults() != 1) return failure(); - if (op.getParentRegion()->getBlocks().size() != 1) { - op.emitOpError() << "tensor to buffer conversion expects a single block " - "in the region containing the operation"; - return failure(); + if (!llvm::hasSingleElement(op.body())) { + return op.emitOpError() + << "tensor to buffer conversion expects a single block " + "in the region containing the operation"; } const auto& original_results = op.getResults(); SmallVector buffer_args(operands.begin(), operands.end()); @@ -389,10 +389,13 @@ struct HloLegalizeToLhlo target.addLegalOp(); target.addLegalOp(); target.addIllegalDialect(); + + BufferAssignmentTypeConverter converter; target.addDynamicallyLegalOp([&](FuncOp op) { auto inputs = op.getType().getInputs(); - return std::all_of(inputs.begin(), inputs.end(), - [](Type input) { return input.isa(); }); + return llvm::all_of(inputs, + [](Type input) { return input.isa(); }) && + converter.isLegal(&op.getBody()); }); target.addDynamicallyLegalOp([&](mlir::ReturnOp returnOp) { return std::all_of(returnOp.operand_type_begin(), @@ -401,8 +404,7 @@ struct HloLegalizeToLhlo }); auto module = getOperation(); - BufferAssignmentTypeConverter converter; - module.walk([&](FuncOp func) { + module.walk([&](FuncOp func) -> WalkResult { BufferAssignmentPlacer bufferAssignment(func); OwningRewritePatternList patterns; populateHLOToLHLOConversionPattern(func.getContext(), &bufferAssignment, @@ -418,8 +420,7 @@ struct HloLegalizeToLhlo /*allowMemrefFunctionResults=*/false>(&context, &bufferAssignment, &converter, &patterns); } - return WalkResult( - applyPartialConversion(func, target, patterns, &converter)); + return applyPartialConversion(func, target, patterns); }); } @@ -463,6 +464,7 @@ void populateHLOToLHLOConversionPattern( HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloOpConverter, 
HloToLhloOpConverter, HloToLhloOpConverter, diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tanh_to_approximation.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tanh_to_approximation.cc new file mode 100644 index 00000000000..9696db377da --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tanh_to_approximation.cc @@ -0,0 +1,167 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file implements logic for lowering the tanh standard ops to an +// approximation. + +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/transforms/passes.h" +#include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" + +namespace mlir { +namespace xla { +namespace { + +/// Emits the fast tanh approximation that is also used by XLA. +static Value EmitTanhApproximation(Value input, Value abs_value, Location loc, + PatternRewriter &rewriter) { + // For small values of x, we can approximate tanh(x)=x. For extremely small + // values of x (|x| < 1e-37), the other approximation would evaluate + // tanh(x) = 0. + constexpr float kCanUseApprox = 0.0004; + Value can_use_approx = + rewriter.create(loc, rewriter.getF32FloatAttr(kCanUseApprox)); + Value return_input = rewriter.create(loc, CmpFPredicate::OLT, + abs_value, can_use_approx); + + // Clamp the input to [-9, 9]. 
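  // (Editorial note, not part of the patch: for |x| >= 9 the true tanh differs
  // from +/-1 by less than 1e-7, so clamping the value fed into the rational
  // approximation below to [-9, 9] does not visibly change the f32 result; it
  // only keeps the powers of x computed below in a numerically safe range.)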
+ Value plus_nine = + rewriter.create(loc, rewriter.getF32FloatAttr(9.0)); + Value smaller_than_nine = + rewriter.create(loc, CmpFPredicate::ULE, abs_value, plus_nine); + Value input_clamped = rewriter.create( + loc, smaller_than_nine, input, + rewriter.create(loc, plus_nine, input)); + + static constexpr std::array numerator_coeffs{ + -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f, + 5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f, + 4.89352455891786e-03f}; + + static constexpr std::array denominator_coeffs{ + 1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f, + 4.89352518554385e-03f}; + + Value input_squared = + rewriter.create(loc, input_clamped, input_clamped); + Value numerator = rewriter.create( + loc, rewriter.getF32FloatAttr(numerator_coeffs[0])); + for (int i = 1; i < numerator_coeffs.size(); i++) { + numerator = rewriter.create( + loc, rewriter.create(loc, input_squared, numerator), + rewriter.create( + loc, rewriter.getF32FloatAttr(numerator_coeffs[i]))); + } + + numerator = rewriter.create(loc, input_clamped, numerator); + + Value denominator = rewriter.create( + loc, rewriter.getF32FloatAttr(denominator_coeffs[0])); + for (int i = 1; i < denominator_coeffs.size(); i++) { + denominator = rewriter.create( + loc, rewriter.create(loc, input_squared, denominator), + rewriter.create( + loc, rewriter.getF32FloatAttr(denominator_coeffs[i]))); + } + + Value approx = rewriter.create(loc, numerator, denominator); + + return rewriter.create(loc, return_input, input, approx); +} + +class ApproximateTanhLowering : public OpRewritePattern { + public: + explicit ApproximateTanhLowering(MLIRContext *ctx) + : OpRewritePattern(ctx, 100) {} + + LogicalResult matchAndRewrite(TanhOp tanhOp, + PatternRewriter &rewriter) const override { + Type operand_type = tanhOp.getType(); + + if (operand_type.isF64()) { + // Similar to XLA, do not rewrite f64 as precision might matter. + return failure(); + } + + Location loc = tanhOp.getLoc(); + Value input = tanhOp.operand(); + if (operand_type.isF16()) { + input = rewriter.create(loc, input, rewriter.getF32Type()); + } + + // If we still do not have f32, fail. + if (!input.getType().isF32()) { + return failure(); + } + + // For |operand| > 20.0, we just return -1/1. + constexpr double kMaxValue = 20.0; + Value max_value = + rewriter.create(loc, rewriter.getF32FloatAttr(kMaxValue)); + Value abs_value = rewriter.create(loc, input); + + Value one = rewriter.create(loc, rewriter.getF32FloatAttr(1.0)); + Value one_with_sign = rewriter.create(loc, one, input); + + Value smaller_than_twenty = + rewriter.create(loc, CmpFPredicate::ULT, abs_value, max_value); + + // Otherwise, we use the approximation. + Value approx = EmitTanhApproximation(input, abs_value, loc, rewriter); + + Value result = rewriter.create(loc, smaller_than_twenty, approx, + one_with_sign); + + // Truncate back if needed. + if (operand_type.isF16()) { + result = rewriter.create(loc, result, rewriter.getF16Type()); + } + + rewriter.replaceOp(tanhOp, {result}); + return success(); + } +}; + +struct LegalizeTanhToApproximation + : public PassWrapper { + /// Perform the lowering of standard dialect operations to approximations. 
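  // (Editorial sketch, not part of the patch: in plain scalar C++ terms the
  // pattern above computes roughly the following; the helper name is invented
  // here purely for illustration and the coefficients are copied from the
  // arrays defined above.
  //
  //   float ApproxTanhF32(float x) {
  //     if (std::abs(x) > 20.0f) return std::copysign(1.0f, x);  // saturated
  //     if (std::abs(x) < 4e-4f) return x;                       // tanh(x) ~= x
  //     float c = std::abs(x) <= 9.0f ? x : std::copysign(9.0f, x);  // clamp
  //     float c2 = c * c;
  //     float p = -2.76076847742355e-16f;   // numerator, Horner form
  //     for (float k : {2.00018790482477e-13f, -8.60467152213735e-11f,
  //                     5.12229709037114e-08f, 1.48572235717979e-05f,
  //                     6.37261928875436e-04f, 4.89352455891786e-03f})
  //       p = p * c2 + k;
  //     p *= c;
  //     float q = 1.19825839466702e-06f;    // denominator, Horner form
  //     for (float k : {1.18534705686654e-04f, 2.26843463243900e-03f,
  //                     4.89352518554385e-03f})
  //       q = q * c2 + k;
  //     return p / q;
  //   }
  //
  // f16 inputs are extended to f32 first and truncated back at the end; f64 is
  // deliberately left to the existing tanh lowering.)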
+ void runOnFunction() override { + OwningRewritePatternList patterns; + PopulateTanhToApproximationPatterns(&getContext(), &patterns); + applyPatternsAndFoldGreedily(getFunction(), patterns); + } +}; + +} // anonymous namespace + +std::unique_ptr> +createLegalizeTanhToApproximationPass() { + return std::make_unique(); +} + +void PopulateTanhToApproximationPatterns(mlir::MLIRContext *context, + OwningRewritePatternList *patterns) { + patterns->insert(context); +} + +static PassRegistration legalize_pass( + "xla-legalize-tanh-to-approximation", + "Legalize tanh from standard dialect to an approximation"); + +} // namespace xla +} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index b7cad554043..1788cd1b270 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -5238,8 +5238,8 @@ LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion, // Fully qualify ReturnOp here as xla_hlo dialect also defines a ReturnOp. target.addLegalOp(); DenseSet nonlegalized_ops; - LogicalResult result = applyPartialConversion( - op, target, patterns, /*converter=*/nullptr, &nonlegalized_ops); + LogicalResult result = + applyPartialConversion(op, target, patterns, &nonlegalized_ops); // In order to enforce that the conversion result is fully converted, // fail if there are any nonlegalized ops in the set. if (failed(result) || !nonlegalized_ops.empty()) { diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index f3c432f38bd..df7b887fcad 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -321,7 +321,10 @@ def : Pat<(TF_PadV2Op $input, (TF_ConstOp $padding), $c), foreach src = [TF_IdentityOp, TF_StopGradientOp] in def : Pat<(src $op), (replaceWithValue $op)>; -def : Pat<(TF_PreventGradientOp $op, $msg), (replaceWithValue $op)>; + +// TODO(b/32223192): Support CheckNumerics in HLO. +foreach src = [TF_PreventGradientOp, TF_CheckNumericsOp] in + def : Pat<(src $op, $msg), (replaceWithValue $op)>; //===----------------------------------------------------------------------===// // MatMul op patterns. diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc index e57d6938efb..54453406ef7 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "absl/strings/string_view.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project @@ -88,6 +89,9 @@ static bool IsOpWhitelisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -127,6 +131,7 @@ static bool IsOpWhitelisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -157,10 +162,14 @@ static bool IsOpWhitelisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -312,13 +321,14 @@ LogicalResult FuncLegalizer::PrepareParams() { } LogicalResult FuncLegalizer::Legalize() { + if (func_.empty()) return success(); + // TensorFlow functions don't use CFGs. - if (func_.getBlocks().size() > 1) { + if (!llvm::hasSingleElement(func_)) { emitError(func_.getLoc()) << "requires at most one block in a TF function"; return failure(); } - if (func_.getBlocks().empty()) return success(); - Block& block = func_.getBlocks().front(); + Block& block = func_.front(); std::vector ops; ops.reserve(block.getOperations().size()); @@ -337,9 +347,9 @@ LogicalResult FuncLegalizer::LegalizeOp(Operation* op) { // Only static shaped operands are supported in XLA builders for now. for (Type ty : op->getOperandTypes()) { - auto ranked_ty = ty.cast(); - if (!ranked_ty.hasStaticShape()) { - op->emitRemark() << "lowering requires static shaped operands"; + auto ranked_ty = ty.dyn_cast(); + if (!ranked_ty || !ranked_ty.hasStaticShape()) { + op->emitRemark() << "lowering requires static shaped tensor operands"; return success(); } } diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc index e16ab571b4d..f0971fdf76e 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc @@ -19,6 +19,7 @@ limitations under the License. #include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h" #include "absl/memory/memory.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "mlir/Dialect/Linalg/Transforms/Transforms.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project @@ -44,7 +45,7 @@ class LhloFuseLinalg : public PassWrapper { auto func = getFunction(); // TODO(pifon): Remove assumption that the function has a single block. 
- if (func.getBlocks().size() != 1) { + if (!llvm::hasSingleElement(func)) { emitError(func.getLoc(), "The function needs to have a single block."); signalPassFailure(); return; @@ -58,7 +59,7 @@ class LhloFuseLinalg : public PassWrapper { for (auto func_arg : func.getArguments()) { result_buffers.insert(func_arg); } - for (auto& block : func.getBlocks()) { + for (auto& block : func) { auto returnOp = mlir::dyn_cast(block.getTerminator()); if (!returnOp) continue; for (auto operand : returnOp.getOperands()) { diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc index 56b9f5879f6..904a30e847a 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc @@ -31,6 +31,17 @@ namespace mlir { namespace xla_lhlo { namespace { +// Builds an affine loop nest iterating from zeros to "upper_bounds" with unit +// steps, and populates the body of the innermost loop using "body_builder". +static void BuildBoundedAffineLoopNest( + OpBuilder& builder, Location location, ArrayRef upper_bounds, + function_ref body_builder) { + SmallVector lower_bounds(upper_bounds.size(), /*Value=*/0); + SmallVector steps(upper_bounds.size(), /*Value=*/1); + buildAffineLoopNest(builder, location, lower_bounds, upper_bounds, steps, + body_builder); +} + struct DotOpConverter : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -48,37 +59,29 @@ struct DotOpConverter : public OpRewritePattern { if ((lhs_type.getRank() != 2) || (rhs_type.getRank() != 2)) { return failure(); } - SmallVector lhs_indices, rhs_indices, result_indices; - const auto& loc = op.getLoc(); - // Create the canonical ijk form of matmul. - auto forOp = rewriter.create(loc, 0, shape_lhs[0]); - lhs_indices.push_back(forOp.getInductionVar()); - result_indices.push_back(forOp.getInductionVar()); + LogicalResult map_status = success(); + auto body_builder = [&](OpBuilder& builder, Location loc, ValueRange ivs) { + SmallVector lhs_indices{ivs[0], ivs[2]}, + rhs_indices{ivs[2], ivs[1]}, result_indices{ivs[0], ivs[1]}; - rewriter.setInsertionPointToStart(forOp.getBody()); - forOp = rewriter.create(loc, 0, shape_rhs.back()); - result_indices.push_back(forOp.getInductionVar()); - rhs_indices.resize(2); - rhs_indices[1] = forOp.getInductionVar(); + auto l = builder.create(loc, lhs, lhs_indices); + auto r = builder.create(loc, rhs, rhs_indices); + auto result = + rewriter.create(loc, op.output(), result_indices); + Value op_result = xla_lhlo::XlaOpToStdScalarOp::map( + op, element_type, {l, r, result}, &builder); + map_status = success(op_result != nullptr); + if (failed(map_status)) return; + builder.create(loc, op_result, op.output(), + result_indices); + }; - rewriter.setInsertionPointToStart(forOp.getBody()); - forOp = rewriter.create(loc, 0, shape_rhs.front()); - lhs_indices.push_back(forOp.getInductionVar()); - rhs_indices[0] = forOp.getInductionVar(); + BuildBoundedAffineLoopNest(rewriter, op.getLoc(), + {shape_lhs[0], shape_rhs[1], shape_rhs[0]}, + body_builder); + if (failed(map_status)) return failure(); - // Construct the innermost loop body. 
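      // (Editorial note, not part of the patch: the body_builder lambda above
      // receives ivs = (i, j, k) over the bounds (M, N, K) = (shape_lhs[0],
      // shape_rhs[1], shape_rhs[0]). It loads lhs[i, k] and rhs[k, j],
      // combines them with the current output[i, j] via XlaOpToStdScalarOp,
      // and stores the result back, which is exactly what the hand-rolled
      // AffineForOp nest being deleted here did.)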
- rewriter.setInsertionPointToStart(forOp.getBody()); - auto l = rewriter.create(loc, lhs, lhs_indices); - auto r = rewriter.create(loc, rhs, rhs_indices); - auto result = - rewriter.create(loc, op.output(), result_indices); - Value op_result = xla_lhlo::XlaOpToStdScalarOp::map( - op, element_type, {l, r, result}, &rewriter); - if (op_result == nullptr) { - return failure(); - } - rewriter.create(loc, op_result, op.output(), result_indices); rewriter.eraseOp(op); return success(); } @@ -99,22 +102,22 @@ struct BinaryOpConverter : public OpRewritePattern { if (lhs_type.getShape() != rhs_type.getShape()) { return failure(); } - const auto& shape = lhs_type.getShape(); - SmallVector induction_vars; - const auto loc = op.getLoc(); - for (int i = 0; i < shape.size(); ++i) { - auto forOp = rewriter.create(loc, 0, shape[i]); - induction_vars.push_back(forOp.getInductionVar()); - rewriter.setInsertionPointToStart(forOp.getBody()); - } - auto l = rewriter.create(loc, lhs, induction_vars); - auto r = rewriter.create(loc, rhs, induction_vars); - Value opResult = xla_lhlo::XlaOpToStdScalarOp::map( - op, element_type, {l, r}, &rewriter); - if (opResult == nullptr) { - return failure(); - } - rewriter.create(loc, opResult, op.out(), induction_vars); + + LogicalResult map_status = success(); + auto body_builder = [&](OpBuilder& builder, Location loc, + ValueRange induction_vars) { + auto l = builder.create(loc, lhs, induction_vars); + auto r = builder.create(loc, rhs, induction_vars); + Value op_result = xla_lhlo::XlaOpToStdScalarOp::map( + op, element_type, {l, r}, &builder); + map_status = success(op_result != nullptr); + if (failed(map_status)) return; + rewriter.create(loc, op_result, op.out(), induction_vars); + }; + + BuildBoundedAffineLoopNest(rewriter, op.getLoc(), lhs_type.getShape(), + body_builder); + if (failed(map_status)) return failure(); rewriter.eraseOp(op); return success(); } diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc index f0eb3cc1a0f..c23b8b49268 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc @@ -177,7 +177,7 @@ struct LhloLegalizeToGpu : public PassWrapper { target.addIllegalOp(); auto func = getFunction(); patterns.insert(func.getContext()); - if (failed(applyPartialConversion(func, target, patterns, nullptr))) { + if (failed(applyPartialConversion(func, target, patterns))) { signalPassFailure(); } } diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc index 99d2c08aa98..78a77dc3b4d 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc @@ -129,7 +129,7 @@ struct DynamicMemRefCastOpConverter void PopulateLhloToLLVMConversionPatterns(LLVMTypeConverter *converter, OwningRewritePatternList *patterns) { patterns->insert( - *converter); + *converter, LowerToLLVMOptions()); } } // namespace xla_lhlo diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc index 9b809049290..63265c4a7e7 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc @@ -43,7 +43,7 @@ class TestLhloToLLVMPass target.addLegalOp(); 
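    // (Editorial note, not part of the patch: the variants of
    // applyFullConversion/applyPartialConversion used here no longer take a
    // TypeConverter argument; type legality is driven by the ConversionTarget
    // and the patterns themselves, which is why the converter argument is
    // dropped at every call site in this change.)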
target.addIllegalDialect(); - if (failed(applyFullConversion(m, target, patterns, &converter))) { + if (failed(applyFullConversion(m, target, patterns))) { signalPassFailure(); } } diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc index b3112d49103..65962c5b7a5 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc @@ -711,7 +711,7 @@ struct LhloLegalizeToParallelLoops target.addIllegalOp(); - if (failed(applyPartialConversion(func, target, patterns, nullptr))) { + if (failed(applyPartialConversion(func, target, patterns))) { signalPassFailure(); } } diff --git a/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h b/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h index 4b9397795a1..8d5f27474a5 100644 --- a/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h +++ b/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h @@ -61,6 +61,7 @@ MAP_HLO_TO_LHLO(MulOp); MAP_HLO_TO_LHLO(NegOp); MAP_HLO_TO_LHLO(RealOp); MAP_HLO_TO_LHLO(ReduceOp); +MAP_HLO_TO_LHLO(ReshapeOp); MAP_HLO_TO_LHLO(RemOp); MAP_HLO_TO_LHLO(RsqrtOp); MAP_HLO_TO_LHLO(SelectOp); diff --git a/tensorflow/compiler/mlir/xla/transforms/passes.h b/tensorflow/compiler/mlir/xla/transforms/passes.h index a2af8124786..3db0bc3b474 100644 --- a/tensorflow/compiler/mlir/xla/transforms/passes.h +++ b/tensorflow/compiler/mlir/xla/transforms/passes.h @@ -115,6 +115,13 @@ std::unique_ptr createLhloCopyRemovalPass(); std::unique_ptr> createLegalizeLhloToParallelLoopsPass(); } // namespace xla_lhlo + +namespace xla { + +/// Lowers the standard TanhOp to an approximation that does not use intrinsics. +std::unique_ptr> createLegalizeTanhToApproximationPass(); + +} // namespace xla } // namespace mlir #endif // TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_PASSES_H_ diff --git a/tensorflow/compiler/mlir/xla/transforms/rewriters.h b/tensorflow/compiler/mlir/xla/transforms/rewriters.h index 59347198fe4..7303b87be75 100644 --- a/tensorflow/compiler/mlir/xla/transforms/rewriters.h +++ b/tensorflow/compiler/mlir/xla/transforms/rewriters.h @@ -91,6 +91,14 @@ void PopulateLegalizeChloToHloPatterns(MLIRContext *context, } // namespace xla_chlo +namespace xla { + +// Populates a pattern that translates the standard TanhOp to an approximation +// that does not use intrinsics. +void PopulateTanhToApproximationPatterns(MLIRContext *context, + OwningRewritePatternList *patterns); + +} // namespace xla } // namespace mlir #endif // TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_REWRITERS_H_ diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_hlo_fusion.cc b/tensorflow/compiler/mlir/xla/transforms/xla_hlo_fusion.cc index c4eb0e143d2..5d3eda0bea5 100644 --- a/tensorflow/compiler/mlir/xla/transforms/xla_hlo_fusion.cc +++ b/tensorflow/compiler/mlir/xla/transforms/xla_hlo_fusion.cc @@ -487,7 +487,7 @@ struct XlaHloFusion : public mlir::PassWrapper { } // process each block and do fusion within a block. 
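    // (Editorial note, not part of the patch: FuncOp exposes the blocks of its
    // body directly, so `for (Block& block : func)` iterates the same range as
    // `func.getBlocks()`; the shorter form is used consistently in this
    // change.)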
- for (Block& block : func.getBlocks()) { + for (Block& block : func) { SmallVector op_list; for (Operation& op : block) { op_list.push_back(&op); diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc index ad78a01100b..e7bb5df8233 100644 --- a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc +++ b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc @@ -125,32 +125,19 @@ class PointwiseToLinalgConverter : public OpConversionPattern { opResultTypes.push_back(shapedType); } + int64_t args_count = bodyArgTypes.size(); + int64_t results_count = bodyResultTypes.size(); auto linalgOp = rewriter.create( - loc, opResultTypes, args, - /*inputCount=*/bodyArgTypes.size(), - /*outputCount=*/bodyResultTypes.size(), indexing_maps, - GetNParallelLoopsAttrs(nloops)); - - // Add a block to the region. - auto* region = &linalgOp.region(); - auto* block = rewriter.createBlock(region, region->end()); - block->addArguments(bodyArgTypes); - if (isLHLO) block->addArguments(bodyResultTypes); - - SmallVector bodyArgs; - for (int i = 0, e = bodyArgTypes.size(); i < e; ++i) { - bodyArgs.push_back(block->getArgument(i)); - } - - rewriter.setInsertionPointToEnd(block); - // TODO(ravishankarm) : For now use the method in xla_lhlo namespace. That - // method needs to be moved out of there. - Value opResult = xla_lhlo::XlaOpToStdScalarOp::map( - op, bodyResultTypes, bodyArgs, &rewriter); - if (!opResult) { - return failure(); - } - rewriter.create(loc, opResult); + loc, opResultTypes, args, args_count, results_count, indexing_maps, + GetNParallelLoopsAttrs(nloops), + [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange args) { + // TODO(ravishankarm) : For now use the method in xla_lhlo namespace. + // That method needs to be moved out of there. + Value opResult = xla_lhlo::XlaOpToStdScalarOp::map( + op, bodyResultTypes, + llvm::to_vector<2>(args.take_front(args_count)), &rewriter); + nestedBuilder.create(loc, opResult); + }); rewriter.replaceOp(op, linalgOp.getOperation()->getResults()); return success(); } @@ -301,27 +288,20 @@ class DataMovementOpConverter : public OpConversionPattern { OpTy op, ArrayRef args, ConversionPatternRewriter& rewriter) const final { if (!verifyXLAOpBufferOrTensorSemantics(op)) return failure(); - auto operandType = op.operand().getType().template cast(); auto resultType = getXLAOpResultType(op); SmallVector indexing_maps = Derived::getIndexingMaps(op, &rewriter); if (indexing_maps.empty()) return failure(); - OpBuilder::InsertionGuard linalgOpGuard(rewriter); auto nloops = resultType.getRank(); auto loc = op.getLoc(); auto linalgOp = rewriter.create( loc, isLHLO ? 
ArrayRef{} : resultType, args, /*inputCount=*/1, - /*outputCount=*/1, indexing_maps, GetNParallelLoopsAttrs(nloops)); - - auto* region = &linalgOp.region(); - auto* block = rewriter.createBlock(region, region->end()); - block->addArguments(operandType.getElementType()); - if (isLHLO) block->addArgument(resultType.getElementType()); - - rewriter.setInsertionPointToEnd(block); - rewriter.create(loc, block->getArgument(0)); + /*outputCount=*/1, indexing_maps, GetNParallelLoopsAttrs(nloops), + [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange args) { + nestedBuilder.create(loc, *args.begin()); + }); rewriter.replaceOp(op, linalgOp.getOperation()->getResults()); return success(); @@ -437,36 +417,26 @@ class LhloBroadcastInDimConverter Value zero = rewriter.create(loc, 0); Value val = rewriter.create(loc, operand, llvm::makeArrayRef({zero})); - auto linalgOp = rewriter.create( + rewriter.create( loc, llvm::None, llvm::makeArrayRef(operand_adaptor.output()), /*inputCount=*/0, /*outputCount=*/1, llvm::makeArrayRef(rewriter.getMultiDimIdentityMap(nloops)), - GetNParallelLoopsAttrs(nloops)); + GetNParallelLoopsAttrs(nloops), + [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange args) { + nestedBuilder.create(loc, val); + }); - auto* region = &linalgOp.region(); - auto* block = rewriter.createBlock(region, region->end()); - block->addArgument(result_type.getElementType()); - - rewriter.setInsertionPointToEnd(block); - rewriter.create(loc, val); } else { auto indexing_maps = getIndexingMaps(op, broadcast_dims, result_shape, operand_type, &rewriter); - - OpBuilder::InsertionGuard linalgOpGuard(rewriter); - auto linalgOp = rewriter.create( + rewriter.create( loc, llvm::None, llvm::makeArrayRef({operand, operand_adaptor.output()}), /*inputCount=*/1, /*outputCount=*/1, indexing_maps, - GetNParallelLoopsAttrs(nloops)); - - auto* region = &linalgOp.region(); - auto* block = rewriter.createBlock(region, region->end()); - block->addArguments(operand_type.getElementType()); - block->addArgument(result_type.getElementType()); - - rewriter.setInsertionPointToEnd(block); - rewriter.create(loc, block->getArgument(0)); + GetNParallelLoopsAttrs(nloops), + [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange args) { + nestedBuilder.create(loc, *args.begin()); + }); } rewriter.replaceOp(op, llvm::None); return success(); @@ -686,32 +656,26 @@ class IotaConverter : public OpConversionPattern { // Construct the indexing maps needed for linalg.generic ops. unsigned nloops = resultMemrefType.getRank(); - auto loc = iotaOp.getLoc(); - auto linalgOp = rewriter.create( - loc, ArrayRef{}, args, + rewriter.create( + iotaOp.getLoc(), ArrayRef{}, args, 0, // args_in 1, // args_out llvm::makeArrayRef(rewriter.getMultiDimIdentityMap(nloops)), - GetNParallelLoopsAttrs(nloops)); + GetNParallelLoopsAttrs(nloops), + [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange ivs, + ValueRange args) { + Value castOp = nestedBuilder.create( + nestedLoc, ivs[iotaOp.iota_dimension().getZExtValue()], + nestedBuilder.getIntegerType( + resultElementType.getIntOrFloatBitWidth())); + if (resultElementType.isa()) { + castOp = nestedBuilder.create(nestedLoc, castOp, + resultElementType); + } + nestedBuilder.create(nestedLoc, castOp); + }); - // Add a block to the region. 
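      // (Editorial note, not part of the patch: as in the converters above,
      // the manual region and block construction removed here is replaced by
      // the linalg::GenericOp builder overload that takes a body-builder
      // lambda; the callback is handed the freshly created block arguments,
      // so the pattern no longer creates the block or moves insertion points
      // itself.)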
- auto* region = &linalgOp.region(); - auto* block = rewriter.createBlock(region, region->end()); - for (unsigned i = 0; i < nloops; ++i) { - block->addArgument(rewriter.getIndexType()); - } - block->addArguments(llvm::makeArrayRef(resultElementType)); - - rewriter.setInsertionPointToEnd(block); - Operation* castOp = rewriter.create( - loc, block->getArgument(iotaOp.iota_dimension().getZExtValue()), - rewriter.getIntegerType(resultElementType.getIntOrFloatBitWidth())); - if (resultElementType.isa()) { - castOp = rewriter.create(loc, castOp->getResult(0), - resultElementType); - } - rewriter.create(loc, castOp->getResult(0)); - rewriter.eraseOp(iotaOp); + rewriter.replaceOp(iotaOp, llvm::None); return success(); } }; diff --git a/tensorflow/compiler/mlir/lite/tests/tf_device_index_selector.mlir b/tensorflow/compiler/tensorflow/tests/tf_device_index_selector.mlir similarity index 94% rename from tensorflow/compiler/mlir/lite/tests/tf_device_index_selector.mlir rename to tensorflow/compiler/tensorflow/tests/tf_device_index_selector.mlir index 1ac7f30d644..7fc2b210f91 100644 --- a/tensorflow/compiler/mlir/lite/tests/tf_device_index_selector.mlir +++ b/tensorflow/compiler/tensorflow/tests/tf_device_index_selector.mlir @@ -1,6 +1,6 @@ // Test DeviceIndex selector. -// RUN: tf-opt --tfl-device-index-selector %s | FileCheck %s +// RUN: tf-opt --tf-device-index-selector %s | FileCheck %s // CHECK-LABEL: func @select func @select(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index b574622efce..42353451408 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -770,6 +770,7 @@ tf_xla_py_test( size = "small", timeout = "long", srcs = ["image_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", shard_count = 10, tags = [ @@ -1452,6 +1453,26 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "case_test", + size = "small", + srcs = ["case_test.py"], + disabled_backends = ["cpu_ondemand"], + python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], + use_xla_device = False, # Uses tf.function(experimental_compile=True) + deps = [ + ":xla_test", + "//tensorflow/compiler/tf2xla/python:xla", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework", + "//tensorflow/python:platform_test", + "//tensorflow/python:training", + ], +) + tf_xla_py_test( name = "gather_test", size = "medium", diff --git a/tensorflow/compiler/tests/case_test.py b/tensorflow/compiler/tests/case_test.py new file mode 100644 index 00000000000..3b2dff537da --- /dev/null +++ b/tensorflow/compiler/tests/case_test.py @@ -0,0 +1,87 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+"""Tests for case statements in XLA."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import image_ops
+from tensorflow.python.ops import io_ops
+from tensorflow.python.platform import test
+
+
+class CaseTest(xla_test.XLATestCase):
+
+  def testCaseBasic(self):
+
+    @def_function.function(experimental_compile=True)
+    def switch_case_test(branch_index):
+
+      def f1():
+        return array_ops.constant(17)
+
+      def f2():
+        return array_ops.constant(31)
+
+      def f3():
+        return array_ops.constant(-1)
+
+      return control_flow_ops.switch_case(
+          branch_index, branch_fns={
+              0: f1,
+              1: f2
+          }, default=f3)
+
+    with ops.device(self.device):
+      self.assertEqual(switch_case_test(array_ops.constant(0)).numpy(), 17)
+      self.assertEqual(switch_case_test(array_ops.constant(1)).numpy(), 31)
+      self.assertEqual(switch_case_test(array_ops.constant(2)).numpy(), -1)
+      self.assertEqual(switch_case_test(array_ops.constant(3)).numpy(), -1)
+
+  def testBranchIsPruned(self):
+
+    @def_function.function(experimental_compile=True)
+    def switch_case_test():
+      branch_index = array_ops.constant(0)
+
+      def f1():
+        return array_ops.constant(17)
+
+      def f2():
+        # Some operations that XLA cannot compile.
+        image_ops.decode_image(io_ops.read_file('/tmp/bmp'))
+        return array_ops.constant(31)
+
+      # This tests that we do not try to compile all branches if the branch
+      # index is trivially constant.
+ return control_flow_ops.switch_case( + branch_index, branch_fns={ + 0: f1, + 1: f2 + }, default=f2) + + with ops.device(self.device): + self.assertEqual(switch_case_test().numpy(), 17) + + +if __name__ == '__main__': + ops.enable_eager_execution() + test.main() diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index 4a8599e29f6..368cb5af2ed 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -79,6 +79,15 @@ tf_cuda_cc_test( ]), ) +cc_library( + name = "common_utils", + hdrs = ["common/utils.h"], + copts = tf_copts(), + deps = [ + "//tensorflow/core/platform:logging", + ] + if_tensorrt([":tensorrt_lib"]), +) + cc_library( name = "trt_op_kernels", srcs = [ @@ -95,6 +104,7 @@ cc_library( ":trt_plugins", ":trt_resources", ":utils", + ":common_utils", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@local_config_cuda//cuda:cuda_headers", @@ -240,6 +250,7 @@ tf_cuda_library( hdrs = ["utils/trt_logger.h"], visibility = ["//visibility:public"], deps = [ + ":common_utils", ":logger_registry", "//tensorflow/core:lib_proto_parsing", ] + if_tensorrt([":tensorrt_lib"]), @@ -375,6 +386,7 @@ tf_cuda_library( "convert/trt_optimization_pass.h", ], deps = [ + ":common_utils", ":logger_registry", ":segment", ":trt_allocator", @@ -488,6 +500,7 @@ cc_library( ], copts = tf_copts(), deps = [ + ":common_utils", "//tensorflow/core:graph", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", @@ -575,6 +588,7 @@ cc_library( hdrs = ["utils/py_utils.h"], copts = tf_copts(), deps = if_tensorrt([ + ":common_utils", ":tensorrt_lib", "//tensorflow/stream_executor/platform:dso_loader", ]), diff --git a/tensorflow/compiler/tf2tensorrt/common/utils.h b/tensorflow/compiler/tf2tensorrt/common/utils.h new file mode 100644 index 00000000000..b428733ecd4 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/common/utils.h @@ -0,0 +1,33 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_ + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { +namespace tensorrt { + +#define LOG_WARNING_WITH_PREFIX LOG(WARNING) << "TF-TRT Warning: " + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT + +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_COMMON_UTILS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 414d27477bc..5429aaf3362 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -25,6 +25,7 @@ limitations under the License. 
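// (Editorial note, not part of the patch: LOG_WARNING_WITH_PREFIX, introduced
// in common/utils.h above, expands to `LOG(WARNING) << "TF-TRT Warning: "`,
// so existing call sites keep streaming their message with `<<` unchanged and
// only gain the common prefix, as the rewrites in this file show.)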
#include #include "absl/strings/str_cat.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" #include "tensorflow/compiler/tf2tensorrt/convert/logger_registry.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" @@ -52,8 +53,7 @@ limitations under the License. #include "tensorflow/core/util/device_name_utils.h" #include "tensorflow/tools/graph_transforms/transform_utils.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -276,8 +276,9 @@ Status GetEngineInfo(const Graph* g, if (segment_devices.size() == 1) { info->device = *segment_devices.begin(); } else if (segment_devices.size() > 1) { - LOG(WARNING) << "Detected multiple (" << segment_devices.size() - << ") devices for the segment. Picking first one to continue."; + LOG_WARNING_WITH_PREFIX + << "Detected multiple (" << segment_devices.size() + << ") devices for the segment. Picking first one to continue."; info->device = *segment_devices.begin(); } else { TfGpuId tf_gpu_id; @@ -663,7 +664,7 @@ std::pair GetDeviceAndAllocator(const ConversionParams& params, StrAppend(&msg, engine.device, "': "); for (auto d : devices) StrAppend(&msg, d->name(), ", "); StrAppend(&msg, ". Will get the allocator from first one."); - LOG(WARNING) << msg; + LOG_WARNING_WITH_PREFIX << msg; } AllocatorAttributes alloc_attr; cuda_device_id = devices[0]->tensorflow_gpu_device_info()->gpu_id; @@ -671,8 +672,8 @@ std::pair GetDeviceAndAllocator(const ConversionParams& params, VLOG(1) << "Using allocator " << dev_allocator->Name() << " and cuda_device_id " << cuda_device_id; } else { - LOG(WARNING) << "Cluster is set but device '" << engine.device - << "' is not found in the cluster"; + LOG_WARNING_WITH_PREFIX << "Cluster is set but device '" << engine.device + << "' is not found in the cluster"; } return std::make_pair(cuda_device_id, dev_allocator); } @@ -770,8 +771,8 @@ Status ConvertAfterShapes(const ConversionParams& params) { Status status = GetEngineInfo(&graph, static_graph_properties, curr_segment, node_map, reverse_topo_order, &curr_engine); if (!status.ok()) { - LOG(WARNING) << "Failed to get engine info for segment " << t << ": " - << status; + LOG_WARNING_WITH_PREFIX << "Failed to get engine info for segment " << t + << ": " << status; continue; } curr_engine.precision_mode = params.precision_mode; @@ -784,8 +785,9 @@ Status ConvertAfterShapes(const ConversionParams& params) { &graph, curr_engine.engine_name); if (!status.ok()) { - LOG(WARNING) << "Failed to register segment graphdef to the library " << t - << ": " << status; + LOG_WARNING_WITH_PREFIX + << "Failed to register segment graphdef to the library " << t << ": " + << status; continue; } @@ -836,7 +838,8 @@ Status ConvertAfterShapes(const ConversionParams& params) { alloc.reset(new TRTDeviceAllocator(device_alloc.second)); } else { // Setting allocator as nullptr should get revert to the cudamalloc - LOG(WARNING) << "Can't identify the cuda device. Running on device 0 "; + LOG_WARNING_WITH_PREFIX + << "Can't identify the cuda device. Running on device 0 "; } cudaSetDevice(cuda_device_id); auto status = @@ -850,9 +853,9 @@ Status ConvertAfterShapes(const ConversionParams& params) { LOG(INFO) << "Replaced " << msg << "."; } else { // Graph is not modified. 
- LOG(WARNING) << "Cannot replace " << msg - << " reason: " << status.error_message() - << " (keeping original segment)."; + LOG_WARNING_WITH_PREFIX << "Cannot replace " << msg + << " reason: " << status.error_message() + << " (keeping original segment)."; } if (VLOG_IS_ON(1)) { msg = "Segment consists of nodes: "; @@ -880,5 +883,4 @@ Status ConvertAfterShapes(const ConversionParams& params) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index 53ab84a6fa9..d3897e864fa 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -24,8 +24,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/types.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -66,7 +65,6 @@ Status RegisterGraphToFunctionLibrary(const GraphDef& segment_graph_def, } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_GRAPH_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc index a1f523d6bfa..54fb1d56441 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc @@ -34,8 +34,7 @@ limitations under the License. #include "tensorflow/core/protobuf/config.pb.h" // NOLINT #include "tensorflow/core/public/session.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -231,5 +230,4 @@ TEST_F(ConvertAfterShapesTest, DirectlyConnectedEngines) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index 20ee5ffd8f8..2ec616ba621 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -31,6 +31,7 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/strings/string_view.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" @@ -58,8 +59,7 @@ limitations under the License. 
#include "tensorflow/core/util/env_var.h" #include "tensorflow/core/util/strided_slice_op.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" #include "third_party/tensorrt/NvInferPlugin.h" @@ -1214,15 +1214,16 @@ static void InitializeTrtPlugins(nvinfer1::ILogger* trt_logger) { nvinfer1::IPluginCreator* const* trt_plugin_creator_list = getPluginRegistry()->getPluginCreatorList(&num_trt_plugins); if (!trt_plugin_creator_list) { - LOG(WARNING) << "Can not find any TensorRT plugins in registry."; + LOG_WARNING_WITH_PREFIX << "Can not find any TensorRT plugins in registry."; } else { VLOG(1) << "Found the following " << num_trt_plugins << " TensorRT plugins in registry:"; for (int i = 0; i < num_trt_plugins; ++i) { if (!trt_plugin_creator_list[i]) { - LOG(WARNING) << "TensorRT plugin at index " << i - << " is not accessible (null pointer returned by " - "getPluginCreatorList for this plugin)"; + LOG_WARNING_WITH_PREFIX + << "TensorRT plugin at index " << i + << " is not accessible (null pointer returned by " + "getPluginCreatorList for this plugin)"; } else { VLOG(1) << " " << trt_plugin_creator_list[i]->getPluginName(); } @@ -1827,9 +1828,9 @@ void Converter::MaybeApplyQuantizationRanges() { // are tensors which are created internally by TF-TRT. The ranges for // these unnamed ITensors are always inferred from user provided ranges, // thus there will also be a warning for the range(s) the user missed. - LOG(WARNING) << "Quantization range was not found for " - << tensor->getName() << ". " - << "Setting invalid quantization range."; + LOG_WARNING_WITH_PREFIX << "Quantization range was not found for " + << tensor->getName() << ". " + << "Setting invalid quantization range."; // Set the range to something unusable so the engine will fail if it // tries to actually use the tensor's range. tensor->setDynamicRange(0, 0); @@ -4424,8 +4425,13 @@ Status ConvertSquare(OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}})); +#if IS_TRT_VERSION_GE(6, 0, 1, 0) + TF_RETURN_IF_ERROR(AllowDataTypes( + *params, {DataType::DT_FLOAT, DataType::DT_HALF, DataType::DT_INT32})); +#else TF_RETURN_IF_ERROR( AllowDataTypes(*params, {DataType::DT_FLOAT, DataType::DT_HALF})); +#endif if (params->validation_only) return Status::OK(); // Constant 2 with same rank as input @@ -4893,10 +4899,11 @@ Status ConvertFusedBatchNorm(OpConverterParams* params) { // Trying to use batchnorm in training mode is a very common problem. // Because the error message will only be printed in VLOG(1) by the // segmenter, we issue a special warning so that users will actually see it. - LOG(WARNING) << node_def.op() << " only supports is_training=false. If you " - << "are using Keras, please call " - << "keras.backend.set_learning_phase(0) before constructing " - << "your model. At " << node_def.name(); + LOG_WARNING_WITH_PREFIX + << node_def.op() << " only supports is_training=false. If you " + << "are using Keras, please call " + << "keras.backend.set_learning_phase(0) before constructing " + << "your model. 
At " << node_def.name(); return errors::Unimplemented(node_def.op(), " only supports is_training=false, at ", node_def.name()); @@ -6034,7 +6041,7 @@ Status ConvertGraphDefToEngine( const string error_message = StrCat("Validation failed for ", node_name, " and input slot ", slot_number, ": ", status.error_message()); - LOG(WARNING) << error_message; + LOG_WARNING_WITH_PREFIX << error_message; return Status(status.code(), error_message); } VLOG(2) << "Adding engine input tensor " << node_name << " with shape " @@ -6250,5 +6257,4 @@ bool OutputEdgeValidator::operator()(const Edge* out_edge) const { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index 7a1276c645c..a621735fad1 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -33,8 +33,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/stream_executor/lib/statusor.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -694,7 +693,6 @@ BinaryOperationMap(); } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 450831910f6..53ec9ee7ada 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -21,8 +21,7 @@ limitations under the License. 
#include #include -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include #include @@ -1811,7 +1810,9 @@ class ParameterizedOpConverterTestBase const int batch_size = input_data_[0].tensor.shape().dim_size(0); Status stat = OpConverterTest::BuildAndRun(input_data_, &output_data, batch_size); - ASSERT_EQ(expected_runtime_status, stat); + ASSERT_EQ(expected_runtime_status.ok(), stat.ok()) + << "expected status: " << expected_runtime_status + << ", actual status: " << stat; if (expected_runtime_status.ok() && stat.ok()) { for (int i = 0; i < n_output; i++) { // Check the shape of the actual output tensors @@ -2754,58 +2755,40 @@ TEST_F(OpConverterTest, ConvertQuantize) { } } -template -void TestConvertSquare(OpConverterTest* test) { - test->Reset(); - typedef typename EnumToDataType::Type CType; - - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), dtype); - auto square = ops::Square(s.WithOpName("my_square"), input); - NodeDef node_def = square.operation.node()->def(); - - test->AddTestTensor("input", {1, 20}, /*batch_size=*/1, - TfDataTypeToTrt(dtype)); - test->RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_square", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({1, 20}, output.tensor()->getDimensions()); - - const int num_inputs = 20; - std::vector inputs(num_inputs); - std::vector expected_outputs(num_inputs); - for (int i = 0; i < num_inputs; ++i) { - const CType value = CType(i - 9); - inputs[i] = value; - expected_outputs[i] = value * value; - } - const DataVec input_data{{"input", test->AsTensor(inputs)}}; - // Engine outputs are converted to FP16 automatically if we set FP16 mode in - // the builder. - DataVec output_data{{"my_square", test->ConstructTensor(num_inputs)}}; - TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); - ExpectArrayNear(expected_outputs, GetSpanForData(output_data[0])); -} - -TEST_F(OpConverterTest, ConvertSquare) { +TEST_P(OpConverterTest2, ConvertSquare) { { // Input is weights, should fail. Reset(); Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); auto square = ops::Square(s.WithOpName("my_square"), input); NodeDef node_def = square.operation.node()->def(); - AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, -5, 6}); + AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, -5, 6}, tf_type); RunValidationAndConversion( node_def, error::UNIMPLEMENTED, "The input \"x\" for Square must be a tensor, at my_square"); } - // OK. Note that kINT32 is not supported by IElementWiseLayer, so we don't - // test DT_INT32 type here. 
- TestConvertSquare(this); - TestConvertSquare(this); + Reset(); + + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto square = ops::Square(s.WithOpName("my_square"), input); + NodeDef node_def = square.operation.node()->def(); + + const int num_inputs = 20; + std::vector inputs(num_inputs); + std::vector expected_outputs(num_inputs); + + for (int i = 0; i < num_inputs; ++i) { + const float value = (i - 9); + inputs[i] = value; + expected_outputs[i] = value * value; + } + AddTestTensor("input", {1, 1, 20}, tf_type, inputs); + + TestOpConverter("my_square", node_def, {1, 1, 20}, Status::OK(), Status::OK(), + ArrayFloatNear(expected_outputs, 0)); } #if IS_TRT_VERSION_GE(5, 1, 0, 0) @@ -6359,87 +6342,70 @@ NodeDef GetSquaredDifferenceNodeDef(DataType dtype) { return squared_diff.operation.node()->def(); } -template -void TestConvertSquaredDifference(OpConverterTest* test) { - typedef typename EnumToDataType::Type CType; - - struct TestParams { - std::vector dims_x; - std::vector dims_y; - std::vector value_x; - std::vector value_y; - std::vector expected_output_dims; - std::vector expected_output; - }; - - const std::vector common_input = InitTestVector(6); - std::vector params = { - { - /*dims_x=*/{1, 2, 3}, - /*dims_y=*/{1, 2, 3}, - /*value_x=*/common_input, - /*value_y=*/CastTestVector({0, -1, 3, 0, 10, -7}), - /*expected_output_dims=*/{1, 2, 3}, - /*expected_output=*/CastTestVector({0, 4, 1, 9, 36, 144}), - }, - { - /*dims_x=*/{1, 2, 3}, - /*dims_y=*/{1, 1, 3}, - /*value_x=*/common_input, - /*value_y=*/CastTestVector({0, 1, 2}), - /*expected_output_dims=*/{1, 2, 3}, - /*expected_output=*/CastTestVector({0, 0, 0, 9, 9, 9}), - }, - }; - - for (int i = 0; i < params.size(); ++i) { - test->Reset(); - - NodeDef node_def = GetSquaredDifferenceNodeDef(dtype); - test->AddTestTensor("x", params[i].dims_x, 1, TfDataTypeToTrt(dtype)); - test->AddTestTensor("y", params[i].dims_y, 1, TfDataTypeToTrt(dtype)); - test->RunValidationAndConversion(node_def); - - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_squared_diff", &output)); - EXPECT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(params[i].expected_output_dims, - output.tensor()->getDimensions()); - - DataVec input_data{{"x", test->AsTensor(params[i].value_x)}, - {"y", test->AsTensor(params[i].value_y)}}; - DataVec output_data{ - {"my_squared_diff", - test->ConstructTensor(params[i].expected_output.size())}}; - TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(params[i].expected_output)); - } -} - -TEST_F(OpConverterTest, ConvertSquaredDifference) { +TEST_P(OpConverterTest2, ConvertSquaredDifference) { { // Input is a weight, should fail. Reset(); - NodeDef node_def = GetSquaredDifferenceNodeDef(DT_FLOAT); + NodeDef node_def = GetSquaredDifferenceNodeDef(tf_type); AddTestWeights("x", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); - AddTestTensor("y", {1, 2, 3}); + AddTestTensor("y", {1, 1, 2, 3}); RunValidationAndConversion(node_def, error::UNIMPLEMENTED, "The input \"x\" for SquaredDifference must be " "a tensor, at my_squared_diff"); } - { - // Shapes are not broadcastable, should fail. 
- Reset(); - NodeDef node_def = GetSquaredDifferenceNodeDef(DT_FLOAT); - AddTestTensor("x", {2, 3}); - AddTestTensor("y", {7, 5}); - RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, - "Infeasible broadcast scheme"); - } - TestConvertSquaredDifference(this); - TestConvertSquaredDifference(this); + struct TestParams { + std::vector dims_x; + std::vector dims_y; + std::vector value_x; + std::vector value_y; + std::vector expected_output_dims; + std::vector expected_output; + Status status; + Status runtime_status; + }; + + const std::vector common_input = InitTestVector(6); + std::vector params = { + {/*dims_x=*/{1, 2, 3}, + /*dims_y=*/{1, 7, 5}, + /*value_x=*/common_input, + /*value_y=*/std::vector(7 * 5, 0), + /*expected_output_dims=*/{1, 1, 2, 3}, + /*expected_output=*/common_input, + trt_mode == TrtTestMode::kDynamicShape + ? Status::OK() + : errors::InvalidArgument("Infeasible broadcast scheme"), + errors::Internal( + "Binding index out of range. This can happen if profile is not set, " + "or the network is invalid for the current profile.")}, + { + /*dims_x=*/{1, 1, 2, 3}, + /*dims_y=*/{1, 1, 2, 3}, + /*value_x=*/common_input, + /*value_y=*/{0, -1, 3, 0, 10, -7}, + /*expected_output_dims=*/{1, 1, 2, 3}, + /*expected_output=*/{0, 4, 1, 9, 36, 144}, + }, + { + /*dims_x=*/{1, 1, 2, 3}, + /*dims_y=*/{1, 1, 1, 3}, + /*value_x=*/common_input, + /*value_y=*/{0, 1, 2}, + /*expected_output_dims=*/{1, 1, 2, 3}, + /*expected_output=*/{0, 0, 0, 9, 9, 9}, + }, + }; + + for (auto p : params) { + Reset(); + NodeDef node_def = GetSquaredDifferenceNodeDef(tf_type); + AddTestTensor("x", p.dims_x, p.value_x); + AddTestTensor("y", p.dims_y, p.value_y); + TestOpConverter("my_squared_diff", node_def, p.expected_output_dims, + p.status, p.runtime_status, + ElementsAreArray(p.expected_output)); + } } #if IS_TRT_VERSION_GE(6, 0, 0, 0) @@ -6669,5 +6635,4 @@ TEST_F(OpConverterTest, ConvertPad) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc index 82e68cbb28d..07c9c2f1ea0 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "tensorflow/compiler/tf2tensorrt/convert/logger_registry.h" @@ -58,5 +57,4 @@ LoggerRegistry* GetLoggerRegistry() { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h index 45b302742d0..2a265cf7caa 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h +++ b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.h @@ -19,7 +19,8 @@ limitations under the License. 
#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" -#if GOOGLE_CUDA +#if GOOGLE_CUDA && GOOGLE_TENSORRT + #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -53,5 +54,5 @@ class RegisterLogger { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_LOGGER_REGISTRY_H_ diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc index 72f4fe5ef9b..1cf98d135cb 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc @@ -28,8 +28,7 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stacktrace.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { namespace convert { @@ -302,5 +301,4 @@ static VerboseCustomGraphOptimizerRegistrar TRTOptimizationPass_Registrar( } // namespace tensorrt } // namespace tensorflow -#endif -#endif +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h index f79048bb5f6..e0aaa5500ab 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h @@ -23,8 +23,7 @@ limitations under the License. #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" #include "tensorflow/core/platform/logging.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -83,6 +82,5 @@ class TRTOptimizationPass : public grappler::CustomGraphOptimizer { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_CUDA -#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc index 3143b06817e..76fb40b9520 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc @@ -22,8 +22,7 @@ limitations under the License. #include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/lib/core/refcount.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -67,5 +66,4 @@ REGISTER_KERNEL_BUILDER(Name("GetCalibrationDataOp").Device(DEVICE_GPU), } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index d9b8e198f4f..1094555a622 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "absl/strings/ascii.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" @@ -44,10 +45,10 @@ limitations under the License. #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/env_var.h" #include "tensorflow/stream_executor/lib/statusor.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "third_party/tensorrt/NvInfer.h" @@ -520,6 +521,17 @@ Status TRTEngineOp::VerifyInputShapes( return Status::OK(); } +static bool AllowEngineNativeSegmentExecution() { + bool value; + Status status = + ReadBoolFromEnvVar("TF_TRT_ALLOW_ENGINE_NATIVE_SEGMENT_EXECUTION", + /*default_value=*/true, &value); + if (!status.ok()) { + LOG(ERROR) << status; + } + return value; +} + void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, AsyncOpKernel::DoneCallback done) { auto helper = new AsyncHelper(done); @@ -604,17 +616,31 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, EngineContext* engine_context = status.ValueOrDie().first; int trt_context_idx = status.ValueOrDie().second; + auto may_execute_native_segment = [&] { + if (!AllowEngineNativeSegmentExecution()) { + ctx->CtxFailure( + errors::Aborted("User disallowed engine native segment execution")); + return false; + } + return true; + }; if (!engine_context->cuda_engine) { - VLOG(1) << "Engine retrieval for input shapes: " - << TensorShapeUtils::ShapeListString(input_concrete_shapes) - << " failed. Running native segment for " << name(); - ExecuteNativeSegment(ctx, helper); + LOG_WARNING_WITH_PREFIX + << "Engine retrieval for input shapes: " + << TensorShapeUtils::ShapeListString(input_concrete_shapes) + << " failed. Running native segment for " << name(); + if (may_execute_native_segment()) { + ExecuteNativeSegment(ctx, helper); + } return; } Status stat = ExecuteTrtEngine(ctx, engine_context, trt_context_idx); if (!stat.ok()) { - LOG(WARNING) << "Failed to execute engine: " << stat - << " Retrying with native segment for " << name(); + LOG_WARNING_WITH_PREFIX << "Failed to execute engine: " << stat + << " Retrying with native segment for " << name(); + if (!may_execute_native_segment()) { + return; + } // Release any outputs that are allocated, ExecuteNativeSegment will // re-allocate them and fail if they are currently allocated. for (int i = 0; i < ctx->num_outputs(); i++) { @@ -727,9 +753,9 @@ StatusOr> TRTEngineOp::BuildEngine( calibrator, &engine, use_calibration, use_implicit_batch_, nullptr, &cache_resource->profiles_); if (!status.ok()) { - LOG(WARNING) << "Engine creation for " << name() << " failed. " - << "The native segment will be used instead. " - << "Reason: " << status; + LOG_WARNING_WITH_PREFIX << "Engine creation for " << name() << " failed. " + << "The native segment will be used instead. " + << "Reason: " << status; // Store an empty engine in the cache for these input shapes so we don't try // to build the same failing engine again. 
cache_resource->cache_.emplace(input_concrete_shapes, @@ -791,8 +817,9 @@ StatusOr> TRTEngineOp::GetEngine( FunctionDefToGraphDef(func_handle_, lib, &segment_graph_def_); } if (!status.ok()) { - LOG(WARNING) << "Getting segment graph for " << name() << " failed. " - << "Reason: " << status; + LOG_WARNING_WITH_PREFIX << "Getting segment graph for " << name() + << " failed. " + << "Reason: " << status; } } auto result = BuildEngine(input_concrete_shapes, batch_size, @@ -851,10 +878,11 @@ StatusOr> TRTEngineOp::GetEngine( // If cache does not have a compatible engine then create a new engine. if (engine_contexts == nullptr) { if (!allow_build_at_runtime_) { - LOG(WARNING) << "Found no engine in cache matching input shapes. " - << "Not building a new engine because " - << "allow_build_at_runtime=False. " - << "The native segment will be used instead."; + LOG_WARNING_WITH_PREFIX + << "Found no engine in cache matching input shapes. " + << "Not building a new engine because " + << "allow_build_at_runtime=False. " + << "The native segment will be used instead."; // Store an empty engine in the cache for these input shapes so we don't // try to build the same failing engine again. cache.emplace(input_concrete_shapes, absl::make_unique()); @@ -980,5 +1008,4 @@ REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp); } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index a06010de1c7..71193dc24cf 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -50,8 +50,7 @@ limitations under the License. #include "tensorflow/core/platform/status.h" #include "tensorflow/core/public/version.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -306,5 +305,4 @@ TYPED_TEST(TRTEngineOpTest, Basic) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc index 2c5821df6ac..3b6e7e91d3b 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc @@ -33,8 +33,7 @@ limitations under the License. #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -251,5 +250,4 @@ REGISTER_KERNEL_BUILDER(Name("SerializeTRTResource").Device(DEVICE_GPU), } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc index 4a24160569d..6a073ee24d0 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops_test.cc @@ -48,8 +48,7 @@ limitations under the License. 
#include "tensorflow/core/platform/tstring.h" #include "tensorflow/core/platform/types.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -246,5 +245,4 @@ TEST_F(TRTEngineResourceOpsTest, Basic) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc b/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc index 573172b92e6..2af3164c3e2 100644 --- a/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc +++ b/tensorflow/compiler/tf2tensorrt/ops/get_calibration_data_op.cc @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" @@ -34,5 +33,4 @@ Returns calibration data for the given resource name } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc index bd3c2b299a9..2527fe9b910 100644 --- a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" @@ -59,5 +58,4 @@ REGISTER_OP("TRTEngineOp") .Attr("static_engine: bool = true"); } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc index 01911de66ec..3141092de03 100644 --- a/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc +++ b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_resource_ops.cc @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" @@ -46,5 +45,4 @@ REGISTER_OP("SerializeTRTResource") } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc b/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc index 4c0d8b0392a..141a7d1f462 100644 --- a/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc +++ b/tensorflow/compiler/tf2tensorrt/plugin/plugin_cast.cu.cc @@ -17,8 +17,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h" #include "tensorflow/core/platform/logging.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #define EIGEN_USE_GPU // For definition of Eigen::GpuDevice. #include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "tensorflow/core/util/gpu_kernel_helper.h" @@ -234,5 +233,4 @@ REGISTER_TFTRT_PLUGIN(CastPluginCreator); } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_CUDA -#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc index 563ce724f43..83d5f9b5965 100644 --- a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc +++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc @@ -17,8 +17,7 @@ limitations under the License. #include -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -30,5 +29,4 @@ const char* kTfTrtPluginNamespace = "TF"; } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_CUDA -#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h index bdb046e6c71..600ac6683da 100644 --- a/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h +++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h @@ -20,8 +20,7 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -90,7 +89,6 @@ class TrtPluginRegistrar { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_H_ diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.cc b/tensorflow/compiler/tf2tensorrt/segment/segment.cc index 749335f1b09..d9080b6f69a 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment.cc +++ b/tensorflow/compiler/tf2tensorrt/segment/segment.cc @@ -22,6 +22,7 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" #include "tensorflow/compiler/tf2tensorrt/segment/union_find.h" #include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/graph/algorithm.h" @@ -34,8 +35,7 @@ limitations under the License. #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/env_var.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -748,9 +748,10 @@ Status SegmentGraph(const Graph* tf_graph, exclude_node(status.error_message()); } else if (tftrt_op_blacklist.count(node->tf_node()->type_string())) { // WARNING verbosity since the user explicitly requests this behavior. 
- LOG(WARNING) << "Blacklisted as TF-TRT candidate, " - << "(Op type: " << node->tf_node()->type_string() << "), " - << "(Op name: " << node->name() << ")"; + LOG_WARNING_WITH_PREFIX + << "Blacklisted as TF-TRT candidate, " + << "(Op type: " << node->tf_node()->type_string() << "), " + << "(Op name: " << node->name() << ")"; exclude_node("Blacklisted with the env var TF_TRT_OP_BLACKLIST"); } else { VLOG(2) << "Accepted as a TF-TRT candidate, " @@ -1038,7 +1039,7 @@ Status SegmentGraph(const Graph* tf_graph, for (const auto& dev : dev_itr->second) { StrAppend(&s, dev, ", "); } - LOG(WARNING) << s; + LOG_WARNING_WITH_PREFIX << s; } segments->emplace_back(segment_nodes); @@ -1060,5 +1061,4 @@ Status SegmentGraph(const Graph* tf_graph, } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.h b/tensorflow/compiler/tf2tensorrt/segment/segment.h index 7295c8f0d9d..3f79983cfd2 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment.h +++ b/tensorflow/compiler/tf2tensorrt/segment/segment.h @@ -25,8 +25,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/types.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -67,7 +66,6 @@ Status SegmentGraph(const Graph* tf_graph, } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_SEGMENT_H_ diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc index 2437481a9c4..f3bc5bfbee6 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc +++ b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc @@ -26,8 +26,7 @@ limitations under the License. #include "tensorflow/core/platform/types.h" #include "tensorflow/core/public/session.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -522,5 +521,4 @@ TEST_F(SegmentTest, IncompatibleBatchSizes) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/segment/union_find.h b/tensorflow/compiler/tf2tensorrt/segment/union_find.h index 70e83c12fca..b53615ec019 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/union_find.h +++ b/tensorflow/compiler/tf2tensorrt/segment/union_find.h @@ -19,8 +19,7 @@ limitations under the License. #include "absl/strings/str_format.h" #include "absl/types/optional.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -217,7 +216,6 @@ UnionFind* UnionFind::FindRoot() { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_ diff --git a/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc b/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc index 510591bfe00..e994d20df33 100644 --- a/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc +++ b/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc @@ -18,8 +18,7 @@ limitations under the License. 
#include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/platform/test.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda.h" #include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "third_party/tensorrt/NvInfer.h" @@ -164,5 +163,4 @@ TEST(TensorrtTest, BasicFunctions) { } // namespace } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc b/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc index 885f58cd70c..a8e24aa8983 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/py_utils.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/utils/py_utils.h" #if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" #include "tensorflow/stream_executor/platform/dso_loader.h" #include "third_party/tensorrt/NvInfer.h" #endif @@ -27,9 +28,10 @@ bool IsGoogleTensorRTEnabled() { #if GOOGLE_CUDA && GOOGLE_TENSORRT auto handle_or = se::internal::DsoLoader::TryDlopenTensorRTLibraries(); if (!handle_or.ok()) { - LOG(WARNING) << "Cannot dlopen some TensorRT libraries. If you would like " - "to use Nvidia GPU with TensorRT, please make sure the " - "missing libraries mentioned above are installed properly."; + LOG_WARNING_WITH_PREFIX + << "Cannot dlopen some TensorRT libraries. If you would like " + "to use Nvidia GPU with TensorRT, please make sure the " + "missing libraries mentioned above are installed properly."; return false; } else { return true; diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc index 617ea7fad5c..d4f3a524577 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc @@ -17,11 +17,9 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda_runtime_api.h" -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -52,8 +50,7 @@ void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space) { } // namespace tensorrt } // namespace tensorflow -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -113,5 +110,4 @@ void TRTDeviceAllocator::free(void* memory) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h index 4ab8b52f523..d219a8a14e8 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h @@ -20,11 +20,9 @@ limitations under the License. 
#include "tensorflow/core/framework/allocator.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -33,8 +31,7 @@ void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space); } // namespace tensorrt } // namespace tensorflow -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -69,6 +66,5 @@ class TRTDeviceAllocator : public TRTBaseAllocator { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ALLOCATOR_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc index 213c1732e59..8ccfb8b06f0 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.cc @@ -25,8 +25,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/errors.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -46,6 +45,14 @@ Status GetTrtBindingShape(const nvinfer1::ICudaEngine* cuda_engine, // Get dims from context instead of engine in explicit batch mode because // the engine might have dynamic shapes. dims = execution_context->getBindingDimensions(binding_index); + if (dims.nbDims == -1) { + // Invalid dimensions. There can be multiple reasons for this. If we have + // incompatible input shapes (network invalid for the current profile) + // that can trigger this error. + return errors::Internal( + "Binding index out of range. This can happen if profile is not set, " + "or the network is invalid for the current profile."); + } #else return errors::Internal( "Explicit batch mode is only supported with TensorRT 6 and above."); @@ -249,5 +256,4 @@ Status TrtEnqueue(nvinfer1::IExecutionContext* execution_context, } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h index a471749877a..1ea4fe28cb4 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_engine_utils.h @@ -24,8 +24,7 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/lib/core/status.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -91,7 +90,6 @@ Status TrtEnqueue(nvinfer1::IExecutionContext* execution_context, } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ENGINE_UTILS_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc index 554c127fa37..24271e352a7 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc @@ -20,8 +20,7 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda_runtime_api.h" namespace tensorflow { @@ -147,5 +146,4 @@ TRTInt8Calibrator::~TRTInt8Calibrator() { } // namespace tensorrt } // namespace tensorflow -#endif -#endif +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h index 06b39716490..4c670e85f52 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h @@ -22,8 +22,7 @@ limitations under the License. #include #include "tensorflow/core/platform/mutex.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "third_party/tensorrt/NvInfer.h" @@ -101,6 +100,5 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator { } // namespace tensorrt } // namespace tensorflow -#endif -#endif +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_INT8_CALIBRATOR_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc index 6bb6f1f9dd8..e34bf5e7397 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc @@ -15,8 +15,8 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT +#include "tensorflow/compiler/tf2tensorrt/common/utils.h" #include "tensorflow/compiler/tf2tensorrt/convert/logger_registry.h" #include "tensorflow/core/platform/logging.h" @@ -35,7 +35,7 @@ void Logger::log(Severity severity, const char* msg) { break; } case Severity::kWARNING: { - LOG(WARNING) << name_ << " " << msg; + LOG_WARNING_WITH_PREFIX << name_ << " " << msg; break; } case Severity::kERROR: { @@ -67,5 +67,4 @@ REGISTER_TENSORRT_LOGGER("DefaultLogger", Logger::GetLogger()); } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_CUDA -#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h index 2ade1b48f47..ce6552e8fe9 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h @@ -18,8 +18,7 @@ limitations under the License. 
#include "tensorflow/core/platform/types.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -40,7 +39,6 @@ class Logger : public nvinfer1::ILogger { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LOGGER_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc index fbcdaad52c0..ee7e6272372 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc @@ -23,8 +23,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/mutex.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" namespace tensorflow { @@ -141,5 +140,4 @@ EngineContext* TRTEngineCacheResource::GetEngineContext(const int profile_id) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h index 8e345254f75..991b9a949e4 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h @@ -115,8 +115,7 @@ class LRUCache { } }; -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT struct EngineContext { EngineContext() {} // Creates an empty context. @@ -223,8 +222,7 @@ class TRTEngineCacheResource : public ResourceBase { TrtShapeOptimizationProfile profiles_; }; -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h index 40c7f5dcf31..fc688b14139 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h @@ -29,8 +29,7 @@ limitations under the License. #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include "third_party/tensorrt/NvInfer.h" @@ -173,6 +172,5 @@ class TrtShapeOptimizationProfile { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT #endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_SHAPE_OPTIMIZATION_PROFILES_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc index 501810587e0..32c2200fb71 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +#if GOOGLE_CUDA && GOOGLE_TENSORRT #include @@ -214,5 +213,4 @@ TEST_F(TrtShapeOptimizationProfileTest, Dynamic) { } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index bfdfe38305b..e072225566d 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -32,6 +32,7 @@ tf_kernel_library( "data_format_ops.cc", "depthtospace_op.cc", "dequantize_op.cc", + "device_index_op.cc", "diag_op.cc", "dynamic_slice_ops.cc", "dynamic_stitch_op.cc", @@ -316,6 +317,7 @@ tf_kernel_library( "//tensorflow/compiler/tf2xla/ops:xla_ops", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client/lib:constants", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", diff --git a/tensorflow/compiler/tf2xla/kernels/case_op.cc b/tensorflow/compiler/tf2xla/kernels/case_op.cc index 1b15c09f7e3..fbd54f1ef39 100644 --- a/tensorflow/compiler/tf2xla/kernels/case_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/case_op.cc @@ -21,13 +21,14 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/lib/core/errors.h" namespace tensorflow { XlaCaseOp::XlaCaseOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("branches", &branches_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("branches", &unpruned_branches_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("Tin", &input_types_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("Tout", &output_types_)); if (!ctx->GetAttr(kXlaTokenInputNodesAttrName, &token_input_nodes_).ok()) { @@ -41,12 +42,29 @@ XlaCaseOp::XlaCaseOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { } } +std::pair, xla::XlaOp> +XlaCaseOp::GetPrunedBranchesAndIndex(XlaOpKernelContext* ctx) { + xla::Literal branch_index_literal; + bool branch_index_is_constant = + ctx->ConstantInput(0, &branch_index_literal).ok(); + + if (!branch_index_is_constant) { + return {unpruned_branches_, ctx->Input(0)}; + } + + int32 branch_index = branch_index_literal.Get({}); + if (branch_index < 0 || branch_index >= unpruned_branches_.size()) { + branch_index = unpruned_branches_.size() - 1; + } + + std::vector pruned_branch = {unpruned_branches_[branch_index]}; + return {pruned_branch, xla::ZerosLike(ctx->Input(0))}; +} + // TODO(b/35949885): There is duplication here with the handling of the // while_op/if_op. Refactor the common code out/rework. 
void XlaCaseOp::Compile(XlaOpKernelContext* ctx) { - xla::XlaBuilder* b = ctx->builder(); - int num_branches = branches_.size(); - OP_REQUIRES(ctx, num_branches >= 1, + OP_REQUIRES(ctx, !unpruned_branches_.empty(), errors::InvalidArgument("Must provide at least one case branch")); OP_REQUIRES(ctx, input_type(0) == DT_INT32, errors::InvalidArgument( @@ -55,6 +73,18 @@ void XlaCaseOp::Compile(XlaOpKernelContext* ctx) { errors::InvalidArgument( "branch_index argument must be scalar for XLA compilation")); + xla::XlaBuilder* b = ctx->builder(); + + // We opportunistically prune out branches if the branch index is a + // compile-time constant. This is important in the context of the DeviceIndex + // ops (and other such ops that may come later) since we may have a Case with + // trivially unselected branches that cannot be compiled into HLO. + std::vector branches; + xla::XlaOp branch_index; + std::tie(branches, branch_index) = GetPrunedBranchesAndIndex(ctx); + + int num_branches = branches.size(); + VLOG(1) << "Building Case: " << input_types_.size() << " inputs"; std::vector arguments(input_types_.size()); @@ -94,7 +124,7 @@ void XlaCaseOp::Compile(XlaOpKernelContext* ctx) { std::vector case_bodies(num_branches); for (int branch_idx = 0; branch_idx < num_branches; branch_idx++) { OP_REQUIRES_OK(ctx, FindMustBeConstNodes( - ctx, branches_[branch_idx], + ctx, branches[branch_idx], &case_branch_must_be_const_nodes[branch_idx], &case_bodies[branch_idx])); } @@ -133,7 +163,7 @@ void XlaCaseOp::Compile(XlaOpKernelContext* ctx) { std::vector branch_results_p(num_branches); for (int j = 0; j < num_branches; ++j) { OP_REQUIRES_OK(ctx, - compiler->CompileFunction(options, branches_[j], arguments, + compiler->CompileFunction(options, branches[j], arguments, &branch_results[j])); branch_results_p[j] = &branch_results[j]; } @@ -171,7 +201,7 @@ void XlaCaseOp::Compile(XlaOpKernelContext* ctx) { for (int j = 0; j < num_branches; ++j) { branch_results[j] = {}; OP_REQUIRES_OK(ctx, - compiler->CompileFunction(options, branches_[j], arguments, + compiler->CompileFunction(options, branches[j], arguments, &branch_results[j])); } } @@ -277,7 +307,7 @@ void XlaCaseOp::Compile(XlaOpKernelContext* ctx) { auto input_tuple = xla::Tuple(b, inputs); xla::XlaOp outputs = - xla::Conditional(ctx->Input(0), absl::MakeSpan(result_computations), + xla::Conditional(branch_index, absl::MakeSpan(result_computations), std::vector(num_branches, input_tuple)); // Sets non-variable outputs. for (int i = 0; i < output_types_.size(); ++i) { diff --git a/tensorflow/compiler/tf2xla/kernels/case_op.h b/tensorflow/compiler/tf2xla/kernels/case_op.h index 4a61707864e..4d22a3db830 100644 --- a/tensorflow/compiler/tf2xla/kernels/case_op.h +++ b/tensorflow/compiler/tf2xla/kernels/case_op.h @@ -50,7 +50,16 @@ class XlaCaseOp : public XlaOpKernel { private: TF_DISALLOW_COPY_AND_ASSIGN(XlaCaseOp); - std::vector branches_; + // If the branch_index input is a constant: prunes out all but the branch + // corresponding to that constant branch index, and returns that branch and + // the literal 0 (as the first and second component of the pair). + // + // If the branch_index input is not a constant: returns unpruned_branches_ and + // the branch_index input.
+ std::pair, xla::XlaOp> GetPrunedBranchesAndIndex( + XlaOpKernelContext* ctx); + + std::vector unpruned_branches_; DataTypeVector input_types_; DataTypeVector output_types_; bool has_token_input_output_; diff --git a/tensorflow/compiler/tf2xla/kernels/device_index_op.cc b/tensorflow/compiler/tf2xla/kernels/device_index_op.cc new file mode 100644 index 00000000000..ff058f92cd7 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/device_index_op.cc @@ -0,0 +1,51 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/client_library.h" +#include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/math.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/core/framework/kernel_def_builder.h" + +namespace tensorflow { +namespace { + +class DeviceIndexOp : public XlaOpKernel { + public: + explicit DeviceIndexOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("device_names", &device_names_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + // When compiling we are not executing on any physical device, so we return + // a sentinel value (size of the list of devices). + ctx->SetOutput( + 0, xla::ConstantR0(ctx->builder(), device_names_.size())); + } + + private: + std::vector device_names_; +}; + +REGISTER_XLA_OP(Name("DeviceIndex"), DeviceIndexOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc index 2684c982600..784b790767c 100644 --- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc @@ -350,6 +350,30 @@ class StridedSliceGradOp : public XlaOpKernel { grad = xla::Rev(grad, dimensions_to_reverse); } grad = xla::Pad(grad, zero, padding_config); + + xla::XlaOp dynamic_shape = ctx->Input(0); + xla::Shape grad_shape = ctx->builder()->GetShape(grad).ValueOrDie(); + ctx->set_dynamic_dimension_is_minus_one(true); + std::vector dynamic_size; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(0, &dynamic_size)); + // Input of strided_slice_op has to have the same shape as output. + DCHECK_EQ(grad_shape.rank(), input_shape.dims()); + for (int64 dim = 0; dim < input_shape.dims(); ++dim) { + DCHECK_EQ(grad_shape.dimensions(dim), input_shape.dim_size(dim)); + if (dynamic_size[dim] == -1) { + // Input is a dynamic dimension, set the same dynamic dimension size in + // the output. 
+ auto dim_size = xla::Slice(dynamic_shape, {dim}, {dim + 1}, {1}); + auto dim_size_scalar = + xla::Reshape(xla::ShapeUtil::MakeScalarShape(xla::S32), dim_size); + grad = xla::SetDimensionSize(grad, dim_size_scalar, dim); + } else if (grad_shape.is_dynamic_dimension(dim)) { + // Input is static but output is dynamic, respect input and remove any + // dynamic dim in the output. + grad = xla::RemoveDynamicDimension(grad, dim); + } + } + ctx->SetOutput(0, grad); } diff --git a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc index 43793be56a7..60d1f3da0c5 100644 --- a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc +++ b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc @@ -165,11 +165,6 @@ Status ConvertGraphDefToXlaViaMlir( device_set.AddDevice(&device); AddDevicesToOp(*module, &device_set); - if (failed(mlir::TF::MarkFunctionVisibilityUsingEntryFunctionSpecification( - *module))) { - return errors::Internal("Problem with mark function visibility"); - } - TF_RETURN_IF_ERROR(mlir::TF::RunBridgeWithStandardPipeline( *module, /*enable_logging=*/VLOG_IS_ON(1), /*enable_inliner=*/true)); diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py index c59c47e92fb..0ebca2d546f 100644 --- a/tensorflow/compiler/tf2xla/python/xla.py +++ b/tensorflow/compiler/tf2xla/python/xla.py @@ -37,6 +37,7 @@ from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import gen_random_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops +from tensorflow.python.ops import special_math_ops # TODO(phawkins): provide wrappers for all XLA operators. Currently the missing # ops include: @@ -103,8 +104,8 @@ sign = _unary_op(math_ops.sign) tanh = _unary_op(math_ops.tanh) # Bessel -bessel_i0e = _unary_op(math_ops.bessel_i0e) -bessel_i1e = _unary_op(math_ops.bessel_i1e) +bessel_i0e = _unary_op(special_math_ops.bessel_i0e) +bessel_i1e = _unary_op(special_math_ops.bessel_i1e) # Binary operators diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 1cf3e10b774..c1aef3ff690 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -268,6 +268,7 @@ Status BuildComputation( return a->arg_num() < b->arg_num(); }); + std::vector aliases; for (const XlaResource* resource : arg_resources) { DCHECK_LT(resource->arg_num(), args.size()); const XlaCompiler::Argument& arg = args[resource->arg_num()]; @@ -289,20 +290,19 @@ Status BuildComputation( update.type = resource->type(); update.shape = resource->shape(); update.modified = modified; - if (is_entry_computation && always_return_tuple && + if (is_entry_computation && arg.resource_kind != XlaResource::kTensorArray && alias_resource_update) { // Assuming tuple arg and results are used. - int64 output_index = elems.size(); - if (use_tuple_arg) { - builder->SetUpAlias(/*output_index=*/{output_index}, - /*param_number=*/0, - /*param_index=*/{update.input_index}); - } else { - builder->SetUpAlias(/*output_index=*/{output_index}, - /*param_number=*/update.input_index, - /*param_index=*/{}); - } + xla::ShapeIndex param_index = + use_tuple_arg ? xla::ShapeIndex({update.input_index}) + : xla::ShapeIndex{}; + int param_number = use_tuple_arg ? 
0 : update.input_index; + int64 output_index_num = elems.size(); + xla::ShapeIndex output_index = xla::ShapeIndex({output_index_num}); + VLOG(3) << "Storing alias: " << output_index.ToString() << ": (" + << param_number << ", " << param_index.ToString() << ")"; + aliases.push_back({output_index, param_number, param_index}); } for (const auto& grad : resource->tensor_array_gradients()) { update.tensor_array_gradients_accessed.insert(grad.first); @@ -381,8 +381,25 @@ Status BuildComputation( xla::XlaScopedShardingAssignment assign_sharding(builder, op_sharding); tuple = xla::Tuple(builder, elems); } - if (!always_return_tuple && elems.size() == 1) { + bool returns_tuple = always_return_tuple || elems.size() != 1; + VLOG(3) << "Computation returns a tuple=" << returns_tuple; + if (!returns_tuple) { xla::GetTupleElement(tuple, 0); + + for (xla::XlaBuilder::InputOutputAlias& alias : aliases) { + if (alias.output_index == xla::ShapeIndex({0})) { + VLOG(3) << "For aliased parameter " << alias.param_number << ": " + << alias.param_index.ToString() + << " normalizing output_index from {0} to {}, as a scalar is " + "returned from the cluster"; + alias.output_index = xla::ShapeIndex({}); + } + } + } + + for (xla::XlaBuilder::InputOutputAlias& alias : aliases) { + builder->SetUpAlias(alias.output_index, alias.param_number, + alias.param_index); } xla::StatusOr computation_status = builder->Build(); diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index afe115deda8..5fc9909fa2a 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -176,15 +176,23 @@ StatusOr LocalExecutable::Run( for (const ShapedBuffer* const arg : arguments) { argument_shapes.push_back(&arg->on_host_shape()); } - TF_ASSIGN_OR_RETURN(auto options_and_stream, - RunHelper(argument_shapes, run_options)); - ExecutableRunOptions options = options_and_stream.first.run_options(); - options.set_device_ordinal(-1); - auto result = RunAsync(arguments, options); - Status block_status = options.stream()->BlockHostUntilDone(); - TF_RETURN_IF_ERROR(result.status()); - TF_RETURN_IF_ERROR(block_status); - return result; + return AsyncCallAndBlockHostUntilDone( + argument_shapes, run_options, [&](const ExecutableRunOptions& options) { + return RunAsync(arguments, options); + }); +} + +StatusOr LocalExecutable::Run( + std::vector arguments, ExecutableRunOptions run_options) { + std::vector argument_shapes; + argument_shapes.reserve(arguments.size()); + for (const ExecutionInput& arg : arguments) { + argument_shapes.push_back(&arg.shape()); + } + return AsyncCallAndBlockHostUntilDone( + argument_shapes, run_options, [&](const ExecutableRunOptions& options) { + return RunAsync(argument_shapes, std::move(arguments), options); + }); } static std::shared_ptr DumpArguments( @@ -312,6 +320,16 @@ StatusOr LocalExecutable::RunAsync( return std::move(outputs); } +StatusOr LocalExecutable::RunAsync( + std::vector arguments, ExecutableRunOptions run_options) { + std::vector argument_shapes; + argument_shapes.reserve(arguments.size()); + for (const ExecutionInput& arg : arguments) { + argument_shapes.push_back(&arg.shape()); + } + return RunAsync(argument_shapes, std::move(arguments), run_options); +} + se::Platform* LocalClient::platform() const { return local_service_->backend().platform(); } diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h index 7cdeb9dcbf6..8b91f4a1739 100644 --- 
a/tensorflow/compiler/xla/client/local_client.h +++ b/tensorflow/compiler/xla/client/local_client.h @@ -51,6 +51,11 @@ class LocalExecutable { const absl::Span arguments, ExecutableRunOptions run_options); + // Similar to Run(), but allows for donating argument buffers to the + // executable. + StatusOr Run(std::vector arguments, + ExecutableRunOptions run_options); + // Similar to Run(), but need not block the host waiting for the computation // to complete before returning. StatusOr RunAsync( @@ -63,6 +68,9 @@ class LocalExecutable { absl::Span argument_host_shapes, std::vector arguments, ExecutableRunOptions run_options); + StatusOr RunAsync(std::vector arguments, + ExecutableRunOptions run_options); + // Return the options used to build the executable. const ExecutableBuildOptions& build_options() const { return build_options_; } @@ -90,6 +98,22 @@ class LocalExecutable { // Backend::devices_equivalent). int build_device_ordinal() const { return build_options_.device_ordinal(); } + template + StatusOr AsyncCallAndBlockHostUntilDone( + absl::Span argument_shapes, + const ExecutableRunOptions& run_options, + std::function(const ExecutableRunOptions&)> async_callback) { + TF_ASSIGN_OR_RETURN(auto options_and_stream, + RunHelper(argument_shapes, run_options)); + ExecutableRunOptions options = options_and_stream.first.run_options(); + options.set_device_ordinal(-1); + StatusOr result = async_callback(options); + Status block_status = options.stream()->BlockHostUntilDone(); + TF_RETURN_IF_ERROR(result.status()); + TF_RETURN_IF_ERROR(block_status); + return result; + } + // Compiled computation. std::unique_ptr executable_; diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index bfba48862f6..56e9aba6112 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -1564,16 +1564,12 @@ XlaOp XlaBuilder::CustomCall( const Shape& shape, const string& opaque, absl::optional> operand_shapes_with_layout) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; if (absl::StartsWith(call_target_name, "$")) { return InvalidArgument( "Invalid custom_call_target \"%s\": Call targets that start with '$' " "are reserved for internal use.", call_target_name); } - *instr.mutable_shape() = shape.ToProto(); - instr.set_custom_call_target(call_target_name); - instr.set_backend_config(opaque); if (operand_shapes_with_layout.has_value()) { if (!LayoutUtil::HasLayout(shape)) { return InvalidArgument( @@ -1586,7 +1582,6 @@ XlaOp XlaBuilder::CustomCall( "with constrained layout; given %d shapes, expected %d", operand_shapes_with_layout->size(), operands.size()); } - instr.set_constrain_layout(true); int64 operand_num = 0; for (const Shape& operand_shape : *operand_shapes_with_layout) { if (!LayoutUtil::HasLayout(operand_shape)) { @@ -1595,14 +1590,31 @@ XlaOp XlaBuilder::CustomCall( "constrained layout.", operand_num); } - *instr.add_operand_shapes_with_layout() = operand_shape.ToProto(); ++operand_num; } } - return AddInstruction(std::move(instr), HloOpcode::kCustomCall, operands); + return CustomCallInternal(call_target_name, operands, shape, opaque, + operand_shapes_with_layout); }); } +StatusOr XlaBuilder::CustomCallInternal( + const string& call_target_name, absl::Span operands, + const Shape& shape, const string& opaque, + absl::optional> operand_shapes_with_layout) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + 
instr.set_custom_call_target(call_target_name); + instr.set_backend_config(opaque); + if (operand_shapes_with_layout.has_value()) { + instr.set_constrain_layout(true); + for (const Shape& operand_shape : *operand_shapes_with_layout) { + *instr.add_operand_shapes_with_layout() = operand_shape.ToProto(); + } + } + return AddInstruction(std::move(instr), HloOpcode::kCustomCall, operands); +} + XlaOp XlaBuilder::CustomCall( const string& call_target_name, absl::Span operands, const XlaComputation& computation, const Shape& shape, const string& opaque, @@ -2727,13 +2739,34 @@ XlaOp XlaBuilder::GetDimensionSize(XlaOp operand, int64 dimension) { }); } -XlaOp XlaBuilder::SetDimensionSize(XlaOp operand, XlaOp val, int64 dimension) { +XlaOp XlaBuilder::RemoveDynamicDimension(XlaOp operand, int64 dimension) { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); - TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferSetDimensionSizeShape( - *operand_shape, dimension)); + Shape shape = *operand_shape; + shape.set_dynamic_dimension(dimension, false); + // Setting an op's dynamic dimension to its static size removes the dynamic + // dimension. + XlaOp static_size = + ConstantR0(this, operand_shape->dimensions(dimension)); + + *instr.mutable_shape() = shape.ToProto(); + instr.add_dimensions(dimension); + return AddInstruction(std::move(instr), HloOpcode::kSetDimensionSize, + {operand, static_size}); + }); +} + +XlaOp XlaBuilder::SetDimensionSize(XlaOp operand, XlaOp val, int64 dimension) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); + TF_ASSIGN_OR_RETURN(const Shape* val_shape, GetShapePtr(val)); + + TF_ASSIGN_OR_RETURN(Shape shape, + ShapeInference::InferSetDimensionSizeShape( + *operand_shape, *val_shape, dimension)); // Setting an op's dynamic dimension to the static size is a noop. TF_ASSIGN_OR_RETURN(const HloInstructionProto* val_proto, LookUpInstruction(val)); @@ -3827,4 +3860,8 @@ XlaOp SetDimensionSize(const XlaOp operand, const XlaOp val, int64 dimension) { return operand.builder()->SetDimensionSize(operand, val, dimension); } +XlaOp RemoveDynamicDimension(const XlaOp operand, int64 dimension) { + return operand.builder()->RemoveDynamicDimension(operand, dimension); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index ffa6a7c3439..3fc26747468 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -527,6 +527,14 @@ class XlaBuilder { const Shape& shape_with_layout, const string& opaque, absl::optional> operand_shapes_with_layout); + // Internal version of CustomCall without computation that doesn't do op + // specific error handling and expects arguments to be legal. CustomCall + // method above calls this method after error handling. 
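For reference, the validation that now precedes CustomCallInternal is what the public layout-constrained wrapper feeds. A hypothetical call site, assuming the existing CustomCallWithLayout free function and a call target named "my_custom_kernel" (both names illustrative):

xla::XlaBuilder b("custom_call_layout_example");
xla::Shape f32_8x8 =
    xla::ShapeUtil::MakeShapeWithLayout(xla::F32, {8, 8}, {1, 0});
xla::XlaOp arg = xla::Parameter(&b, 0, f32_8x8, "arg");
// Supplying operand_shapes_with_layout makes the builder set
// constrain_layout=true, the branch handled inside CustomCallInternal.
xla::CustomCallWithLayout(&b, "my_custom_kernel", {arg},
                          /*shape_with_layout=*/f32_8x8,
                          /*operand_shapes_with_layout=*/{f32_8x8},
                          /*opaque=*/"");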
+ virtual StatusOr CustomCallInternal( + const string& call_target_name, absl::Span operands, + const Shape& shape_with_layout, const string& opaque, + absl::optional> operand_shapes_with_layout); + XlaOp CustomCall( const string& call_target_name, absl::Span operands, const XlaComputation& computation, const Shape& shape_with_layout, @@ -704,6 +712,8 @@ class XlaBuilder { XlaOp SetDimensionSize(XlaOp operand, XlaOp val, int64 dimension); + XlaOp RemoveDynamicDimension(XlaOp operand, int64 dimension); + StatusOr AddInstruction(HloInstructionProto&& instr, HloOpcode opcode, absl::Span operands = {}); @@ -1151,6 +1161,7 @@ class XlaBuilder { friend XlaOp GetDimensionSize(XlaOp operand, int64 dimension); friend XlaOp SetDimensionSize(XlaOp operand, XlaOp val, int64 dimension); + friend XlaOp RemoveDynamicDimension(XlaOp operand, int64 dimension); protected: // Returns OK status if the given op was built using this builder. Otherwise, @@ -2149,6 +2160,9 @@ XlaOp GetDimensionSize(XlaOp operand, int64 dimension); XlaOp SetDimensionSize(XlaOp operand, XlaOp val, int64 dimension); +// Returns the same op but with dynamic dimension removed. +XlaOp RemoveDynamicDimension(XlaOp operand, int64 dimension); + // Implementation details below this point. // diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc index 4fa47077fca..7011c946203 100644 --- a/tensorflow/compiler/xla/client/xla_builder_test.cc +++ b/tensorflow/compiler/xla/client/xla_builder_test.cc @@ -556,6 +556,32 @@ TEST_F(XlaBuilderTest, DynamicParameter) { EXPECT_TRUE(param_shape.is_dynamic_dimension(0)); } +TEST_F(XlaBuilderTest, SetDimensionSize) { + XlaBuilder b(TestName()); + auto p0 = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {10}), "p0"); + auto p1 = Parameter(&b, 1, ShapeUtil::MakeShape(S32, {}), "p1"); + auto set_dim_size = SetDimensionSize(p0, p1, 0); + TF_ASSERT_OK_AND_ASSIGN(auto module, + BuildHloModule(&b, /*root=*/set_dim_size)); + const Shape& root_shape = + module->entry_computation()->root_instruction()->shape(); + EXPECT_TRUE(root_shape.is_dynamic_dimension(0)); +} + +TEST_F(XlaBuilderTest, RemoveDimensionSize) { + XlaBuilder b(TestName()); + auto p0 = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {10}), "p0"); + auto p1 = Parameter(&b, 1, ShapeUtil::MakeShape(S32, {}), "p1"); + auto set_dim_size = SetDimensionSize(p0, p1, 0); + auto remove_dim_size = RemoveDynamicDimension(set_dim_size, 0); + TF_ASSERT_OK_AND_ASSIGN(auto module, + BuildHloModule(&b, /*root=*/remove_dim_size)); + const Shape& root_shape = + module->entry_computation()->root_instruction()->shape(); + // Dynamic dimension has been removed. + EXPECT_FALSE(root_shape.is_dynamic_dimension(0)); +} + TEST_F(XlaBuilderTest, DynamicUnary) { XlaBuilder b(TestName()); Shape tuple_param_shape = ShapeUtil::MakeTupleShape( diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc index 81655101701..8ca6e2b294c 100644 --- a/tensorflow/compiler/xla/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -64,7 +64,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_cpu_fast_math_honor_division(true); // By default, copy TF's Eigen style min_max behavior with nans. 
- opts.set_xla_cpu_enable_fast_min_max(false); + opts.set_xla_cpu_enable_fast_min_max(true); opts.set_xla_gpu_enable_fast_min_max(true); diff --git a/tensorflow/compiler/xla/pjrt/BUILD b/tensorflow/compiler/xla/pjrt/BUILD index dd50d0577d4..695ba9dee93 100644 --- a/tensorflow/compiler/xla/pjrt/BUILD +++ b/tensorflow/compiler/xla/pjrt/BUILD @@ -141,12 +141,15 @@ cc_library( "//tensorflow/compiler/xla/service/gpu:gpu_executable_run_options", "//tensorflow/core:allocator", "//tensorflow/core:lib", + "//tensorflow/core/profiler/lib:connected_traceme", "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/core/profiler/lib:traceme_encode", "//tensorflow/stream_executor:event", "//tensorflow/stream_executor:stream", "//tensorflow/stream_executor/host:host_platform_id", "//tensorflow/stream_executor/lib", "@com_google_absl//absl/base", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/memory", @@ -154,6 +157,7 @@ cc_library( "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/synchronization", "@com_google_absl//absl/time", + "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", ], ) diff --git a/tensorflow/compiler/xla/pjrt/distributed/service.h b/tensorflow/compiler/xla/pjrt/distributed/service.h index 725a76791ce..9ecbdb3cc7c 100644 --- a/tensorflow/compiler/xla/pjrt/distributed/service.h +++ b/tensorflow/compiler/xla/pjrt/distributed/service.h @@ -54,15 +54,15 @@ class DistributedRuntimeServiceImpl final absl::Mutex mu_; enum class State { kInitializing, kRunning }; - State state_ GUARDED_BY(mu_) = State::kInitializing; + State state_ ABSL_GUARDED_BY(mu_) = State::kInitializing; - std::vector local_topologies_ GUARDED_BY(mu_); - GlobalTopologyProto topology_ GUARDED_BY(mu_); + std::vector local_topologies_ ABSL_GUARDED_BY(mu_); + GlobalTopologyProto topology_ ABSL_GUARDED_BY(mu_); struct Node { bool present = false; }; - int num_nodes_present_ GUARDED_BY(mu_) = 0; - std::vector nodes_ GUARDED_BY(mu_); + int num_nodes_present_ ABSL_GUARDED_BY(mu_) = 0; + std::vector nodes_ ABSL_GUARDED_BY(mu_); KeyValueStore key_value_store_; }; diff --git a/tensorflow/compiler/xla/pjrt/local_device_state.cc b/tensorflow/compiler/xla/pjrt/local_device_state.cc index d173c891c95..a229e56001e 100644 --- a/tensorflow/compiler/xla/pjrt/local_device_state.cc +++ b/tensorflow/compiler/xla/pjrt/local_device_state.cc @@ -127,11 +127,15 @@ std::unique_ptr LocalDeviceState::BorrowStreamFromPool() { } else { std::unique_ptr stream = std::move(usage_stream_pool_.top()); usage_stream_pool_.pop(); + stream->RefreshStatus().IgnoreError(); // Can return error::Unimplemented + QCHECK(stream->ok()); return stream; } } void LocalDeviceState::ReturnStreamToPool(std::unique_ptr stream) { + stream->RefreshStatus().IgnoreError(); // Can return error::Unimplemented + QCHECK(stream->ok()); absl::MutexLock lock(&mu_); usage_stream_pool_.push(std::move(stream)); } diff --git a/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc index de760af8fd9..edffaf6c877 100644 --- a/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc +++ b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc @@ -169,7 +169,7 @@ class NcclIdStore { const std::shared_ptr client_; absl::Mutex mu_; - absl::flat_hash_map cache_ GUARDED_BY(mu_); + absl::flat_hash_map cache_ ABSL_GUARDED_BY(mu_); }; StatusOr NcclIdStore::GetNcclUniqueId(const NcclCliqueKey& 
key) { diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.cc b/tensorflow/compiler/xla/pjrt/pjrt_client.cc index ccb72b7ce30..e341a11d64f 100644 --- a/tensorflow/compiler/xla/pjrt/pjrt_client.cc +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.cc @@ -76,11 +76,13 @@ limitations under the License. #include "absl/strings/str_format.h" #include "absl/synchronization/mutex.h" #include "absl/time/time.h" +#include "absl/types/optional.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/cpu_function_runtime.h" #include "tensorflow/compiler/xla/executable_run_options.h" +#include "tensorflow/compiler/xla/layout.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/pjrt/distributed/protocol.pb.h" @@ -98,7 +100,9 @@ limitations under the License. #include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/lib/connected_traceme.h" #include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/profiler/lib/traceme_encode.h" #include "tensorflow/stream_executor/device_memory.h" #include "tensorflow/stream_executor/device_memory_allocator.h" #include "tensorflow/stream_executor/event.h" @@ -749,16 +753,22 @@ StatusOr> PjRtBuffer::FromHostLiteral( // memory that has already been allocated, and a possible Event // allocation. + se::Stream* h2d_stream = local_device->host_to_device_stream(); ShapedBuffer buffer = device_buffer->AsShapedBuffer( compact_shape, on_device_shape, client->client()->platform()); TF_CHECK_OK(transfer_manager->TransferLiteralToDeviceAsync( - local_device->host_to_device_stream(), literal, buffer)); + h2d_stream, literal, buffer)); std::shared_ptr event = device_buffer->definition_events()[0]; TF_CHECK_OK(AddDestinationBufferSynchronization( - local_device, std::move(device_buffer), event, - local_device->host_to_device_stream())); + local_device, std::move(device_buffer), event, h2d_stream)); + + // This can sometimes catch the case where the literal memory has been + // freed before the H2D transfer was issued. + h2d_stream->RefreshStatus() + .IgnoreError(); // Can return error::Unimplemented + QCHECK(h2d_stream->ok()); }; client->h2d_transfer_pool()->Schedule(transfer_h2d); return py_buffer; @@ -853,10 +863,10 @@ StatusOr> PjRtBuffer::Release( if (device_buffer_ == nullptr) { return std::shared_ptr(); } - // Set host_value_ and device_buffer_ to null now so that no other thread - // can add a hold while we are in WaitForOutstandingUsageHolds() + // Clear host_values_ and set device_buffer_ to null now so that no other + // thread can add a hold while we are in WaitForOutstandingUsageHolds() // below. - host_value_ = nullptr; + host_values_.clear(); std::swap(device_buffer_, device_buffer); WaitForOutstandingUsageHolds(); // Now that all holds have completed and no more can be added, we can get @@ -991,7 +1001,7 @@ void PjRtBuffer::ConfirmDonation(TrackedDeviceBuffer* device_buffer) { device_buffer->ReleaseDeviceMemory(); // Make *this invalid so it can't be used again. Any threads blocking in // Release or GetBufferWithHold will see an invalid buffer and return. - host_value_ = nullptr; + host_values_.clear(); device_buffer_.reset(); } // Unblock another thread, if any, trying to get a donation hold. 
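The host_values_ map that replaces the single host_value_ above caches one host copy per requested layout; the layout-aware ToLiteral/CopyToHostAsync overloads in the following hunks are its consumers. A hypothetical call, assuming buffer is a PjRtBuffer* holding a 2-D array and the caller sits in a function where TF_ASSIGN_OR_RETURN is usable:

// Transfer in an explicit row-major layout and drop the cached host copy once
// the literal has been handed out.
xla::Layout row_major = xla::LayoutUtil::MakeLayout({1, 0});
TF_ASSIGN_OR_RETURN(
    std::shared_ptr<xla::Literal> literal,
    buffer->ToLiteral(/*discard_cached_copy=*/true, row_major));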
@@ -1011,7 +1021,14 @@ void PjRtBuffer::DropHold(ScopedHold::Type type, TrackedDeviceBuffer* buffer) { } } -Status PjRtBuffer::CopyToHostAsync() { +Status PjRtBuffer::CopyToHostAsync(absl::optional layout) { + return CopyToHostAsyncInternal(/*discard_cached_copy=*/false, layout) + .status(); +} + +StatusOr> +PjRtBuffer::CopyToHostAsyncInternal(bool discard_cached_copy, + absl::optional layout) { if (IsEmptyTuple()) { return InvalidArgument("CopyToHostAsync called on empty tuple"); } @@ -1019,6 +1036,8 @@ Status PjRtBuffer::CopyToHostAsync() { std::shared_ptr host_value; LocalDeviceState* local_device = device_->local_device_state(); se::Stream* stream = local_device->GetDeviceToHostStream(); + const xla::Layout& host_layout = + layout.has_value() ? layout.value() : on_host_shape_.layout(); { absl::MutexLock lock(&mu_); // We can't perform any other action while a donation hold is in progress. @@ -1026,17 +1045,36 @@ Status PjRtBuffer::CopyToHostAsync() { if (device_buffer_ == nullptr) { return InvalidArgument("CopyToHostAsync() called on invalid buffer."); } - if (host_value_) { - // The host value has already been requested or is available. - return Status::OK(); + if (discard_cached_copy) { + auto it = host_values_.find(host_layout); + if (it != host_values_.end()) { + host_value = it->second; + host_values_.erase(it); + return host_value; + } else { + host_value = std::make_shared(); + } + } else { + std::shared_ptr& host_value_ref = host_values_[host_layout]; + if (host_value_ref) { + return host_value_ref; + } + host_value = host_value_ref = std::make_shared(); } - host_value = host_value_ = std::make_shared(); AcquireHoldLocked(&device_buffer); } WaitForBufferDefinitionEventsOnStream(*device_buffer, stream); - host_value->value = std::make_shared(on_host_shape_); + Shape host_shape; + if (layout.has_value()) { + host_shape = ShapeUtil::MakeShape(on_host_shape_.element_type(), + on_host_shape_.dimensions()); + *host_shape.mutable_layout() = host_layout; + } else { + host_shape = on_host_shape_; + } + host_value->value = std::make_shared(host_shape); ShapedBuffer shaped_buffer = device_buffer->AsShapedBuffer( - on_host_shape_, on_device_shape_, client_->client()->platform()); + host_shape, on_device_shape_, client_->client()->platform()); client_->client()->backend().transfer_manager()->TransferLiteralFromDevice( stream, shaped_buffer, host_value->value.get(), [host_value](Status done_status) { @@ -1066,17 +1104,14 @@ Status PjRtBuffer::CopyToHostAsync() { RecordUsage(std::move(device_buffer), local_device, local_device, usage_event, stream, /*prefer_to_retain_reference=*/true); - return Status::OK(); + return host_value; } -StatusOr> PjRtBuffer::ToLiteral() { +StatusOr> PjRtBuffer::ToLiteral( + const bool discard_cached_copy, absl::optional layout) { tensorflow::profiler::TraceMe traceme("PjRtBuffer::ToLiteral"); - TF_RETURN_IF_ERROR(CopyToHostAsync()); - std::shared_ptr host_value; - { - absl::MutexLock lock(&mu_); - host_value = host_value_; - } + TF_ASSIGN_OR_RETURN(std::shared_ptr host_value, + CopyToHostAsyncInternal(discard_cached_copy, layout)); if (host_value == nullptr) { return InvalidArgument("ToLiteral called on invalid buffer"); } @@ -1429,10 +1464,9 @@ StatusOr PjRtExecutable::EnqueueExecution( int executable_idx, const RunId& run_id, const ExecuteOptions& options, Device* device, std::vector* device_buffers) const { int device_ordinal = device->local_device_state()->device_ordinal(); - tensorflow::profiler::TraceMe traceme([&] { - return 
absl::StrCat("LocalExecutable::Execute#run_id=", run_id.ToInt(), - "#"); - }); + tensorflow::profiler::TraceMeConsumer activity( + "LocalExecutable::Execute", tensorflow::profiler::ContextType::kPjRt, + run_id.ToInt()); VLOG(3) << "Replica " << replica << ", partition " << partition << " mapped to device ordinal for execution: " << device_ordinal; @@ -1721,10 +1755,9 @@ PjRtExecutable::ExecuteOnLocalDevices( absl::Span> argument_handles, const ExecuteOptions& options) const { RunId run_id; - tensorflow::profiler::TraceMe traceme([&] { - return absl::StrCat( - "LocalExecutable::ExecuteOnLocalDevices#run_id=", run_id.ToInt(), "#"); - }); + tensorflow::profiler::TraceMeProducer activity( + "LocalExecutable::ExecuteOnLocalDevices", + tensorflow::profiler::ContextType::kPjRt, run_id.ToInt()); const int num_local_devices = local_devices_.size(); diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.h b/tensorflow/compiler/xla/pjrt/pjrt_client.h index 754eb19bec6..c50d09f631c 100644 --- a/tensorflow/compiler/xla/pjrt/pjrt_client.h +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.h @@ -20,15 +20,18 @@ limitations under the License. #include #include +#include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" #include "absl/container/inlined_vector.h" #include "absl/strings/string_view.h" #include "absl/synchronization/mutex.h" #include "absl/synchronization/notification.h" +#include "absl/types/optional.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/client/executable_build_options.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/layout.h" #include "tensorflow/compiler/xla/pjrt/local_device_state.h" #include "tensorflow/compiler/xla/pjrt/tracked_device_buffer.h" #include "tensorflow/compiler/xla/service/computation_placer.h" @@ -478,13 +481,20 @@ class PjRtBuffer { // Returns the buffer's value as an XLA Literal. If the value has previously // been prefetched to the host, then returns the prefetched version, otherwise - // copies the buffer to the host. Blocks until the value is ready. - StatusOr> ToLiteral(); + // copies the buffer to the host. Blocks until the value is ready. If + // `discard_cached_copy` is true then buffer will no longer keep hold of a + // cached copy of the literal (i.e. The reference to the host value will be + // removed.) If a layout is passed than a literal with this layout will be + // returned. + StatusOr> ToLiteral( + bool discard_cached_copy = false, + absl::optional layout = {}); // Initiates a copy of the buffer to the host. Does not block waiting for // the transfer to complete. The value can be retrieved by a later call to - // ToLiteral(). - Status CopyToHostAsync(); + // ToLiteral(). If a layout is passed then a cached copy with this layout will + // be created. + Status CopyToHostAsync(absl::optional layout = {}); // Drops the buffer's reference to its associated device memory, leaving the // buffer in an invalid state. The memory will be freed lazily when all async @@ -592,6 +602,14 @@ class PjRtBuffer { // successfully donated to an execution. void ConfirmDonation(TrackedDeviceBuffer* device_buffer); + // Initiates a copy of the buffer to the host. Does not block waiting for + // the transfer to complete. A host value is returned and if + // `discard_cached_copy` is false stored in an internal buffer so that future + // transfers don't have to transfer the data from host again. 
If a layout is + // passed then a literal of this layout will be returned and possibly cached. + StatusOr> CopyToHostAsyncInternal( + bool discard_cached_copy, absl::optional layout); + // Drops a hold without taking any other action. Does a sanity check that // buffer==device_buffer_ or device_buffer_==nullptr. void DropHold(ScopedHold::Type type, TrackedDeviceBuffer* buffer); @@ -610,6 +628,8 @@ class PjRtBuffer { mutable absl::Mutex mu_; std::shared_ptr device_buffer_ TF_GUARDED_BY(mu_); + absl::flat_hash_map> host_values_ + TF_GUARDED_BY(mu_); std::shared_ptr host_value_ TF_GUARDED_BY(mu_); // Count of holds on the buffer. std::array holds_ TF_GUARDED_BY(mu_); diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index acd35cbc153..10e2d7e65d1 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1202,6 +1202,9 @@ cc_library( srcs = ["transfer_manager.cc"], hdrs = ["transfer_manager.h"], deps = [ + ":compiler", + ":executable", + ":maybe_owning_device_memory", ":shaped_buffer", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", @@ -1210,8 +1213,6 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", - "//tensorflow/compiler/xla/service:executable", - "//tensorflow/compiler/xla/service:maybe_owning_device_memory", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/stream_executor:device_memory", @@ -1679,6 +1680,7 @@ cc_library( hdrs = ["multi_output_fusion.h"], deps = [ ":hlo", + ":hlo_dce", ":hlo_pass", ":hlo_reachability", "//tensorflow/compiler/xla:debug_options_flags", diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index ce2a801fccd..130661bf1cd 100755 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -2815,6 +2815,28 @@ Status AlgebraicSimplifierVisitor::HandleCompare(HloInstruction* compare) { HloInstruction* lhs; HloInstruction* rhs; CHECK(Match(compare, m::Compare(m::Op(&lhs), m::Op(&rhs)))); + { + // compare(broadcast(a) + x, broadcast(b)) ==> + // compare(x, broadcast(b-a)) + HloInstruction *x, *a, *b; + if (Match(compare, + m::Compare( + m::AddAnyOrder(m::Op(&x), m::Broadcast(m::Op(&a).WithShape( + m::Shape().IsScalar()))), + m::Broadcast(m::Op(&b).WithShape(m::Shape().IsScalar()))))) { + if (ShapeUtil::ElementIsSigned(x->shape())) { + HloInstruction* sub = + computation_->AddInstruction(HloInstruction::CreateBinary( + b->shape(), HloOpcode::kSubtract, b, a)); + HloInstruction* broadcast = computation_->AddInstruction( + HloInstruction::CreateBroadcast(x->shape(), sub, {})); + HloInstruction* new_compare = computation_->AddInstruction( + HloInstruction::CreateCompare(compare->shape(), x, broadcast, + compare->comparison_direction())); + return ReplaceInstruction(compare, new_compare); + } + } + } if (compare->comparison_direction() == ComparisonDirection::kLt && lhs->opcode() == HloOpcode::kIota && IsAll(rhs, 0)) { diff --git a/tensorflow/compiler/xla/service/conditional_code_motion.cc b/tensorflow/compiler/xla/service/conditional_code_motion.cc index eecdcc851e9..6db4c3eb6d4 100644 --- a/tensorflow/compiler/xla/service/conditional_code_motion.cc +++ b/tensorflow/compiler/xla/service/conditional_code_motion.cc @@ -106,7 +106,6 @@ class BranchVisitor { boundaries_.emplace_back(operand, i, inst); continue; } 
- worklist_.push_back(operand); visited_.insert(operand); } @@ -197,6 +196,7 @@ bool WorthHoisting(HloInstruction* instruction) { case HloOpcode::kMultiply: case HloOpcode::kDivide: case HloOpcode::kTuple: + case HloOpcode::kSqrt: case HloOpcode::kGetTupleElement: return true; default: @@ -206,10 +206,11 @@ bool WorthHoisting(HloInstruction* instruction) { // Compare if the instructions to be visited at each branches are identical. bool InstructionWithinBranchIdentical( - const std::vector& instructions, bool is_layout_senstive) { + const std::vector& instructions, + bool is_layout_sensitive) { // Identical includes the shape of each operands are equal. auto eq_operand = [&](const HloInstruction* a, const HloInstruction* b) { - bool eq_operands = is_layout_senstive + bool eq_operands = is_layout_sensitive ? ShapeUtil::Equal(a->shape(), b->shape()) : ShapeUtil::Compatible(a->shape(), b->shape()); return eq_operands; @@ -233,7 +234,7 @@ bool InstructionWithinBranchIdentical( auto old_channel_id = instruction->channel_id(); instruction->set_channel_id(instructions[0]->channel_id()); bool eq_instructions = instructions[0]->Identical( - *instruction, eq_operand, eq_computations, is_layout_senstive); + *instruction, eq_operand, eq_computations, is_layout_sensitive); instruction->set_channel_id(old_channel_id); return eq_instructions; }); @@ -243,7 +244,7 @@ bool InstructionWithinBranchIdentical( [&](HloInstruction* instruction) { return instructions[0]->Identical( *instruction, eq_operand, eq_computations, - is_layout_senstive); + is_layout_sensitive); }); } @@ -354,12 +355,228 @@ Status RemoveInstructionFromComputation( return Status::OK(); } +// Identify converts to be hoisted/rematerialized out of the branch +// computations. +absl::flat_hash_set FindSpecialConverts(HloInstruction* old_root, + int branch_count, + HloInstruction* conditional, + bool is_layout_sensitive) { + absl::flat_hash_set kspecial_convert; + for (int64 operand_num = 0; operand_num < old_root->operand_count(); + ++operand_num) { + if (old_root->operand(operand_num)->opcode() != HloOpcode::kConvert) { + continue; + } + bool replica = true; + HloInstruction* kspecial_convert_candidate = + old_root->mutable_operand(operand_num); + // Check whether an identical candidate appears in other branches + for (int others = 1; others < branch_count; ++others) { + HloInstruction* others_root = + conditional->branch_computation(others)->root_instruction(); + bool eq_shape = + is_layout_sensitive + ? ShapeUtil::Equal(others_root->operand(operand_num)->shape(), + kspecial_convert_candidate->shape()) + : ShapeUtil::Compatible( + others_root->operand(operand_num)->shape(), + kspecial_convert_candidate->shape()); + if ((others_root->operand(operand_num)->opcode() == + HloOpcode::kConvert) && + eq_shape) { + // Nothing to be done. + } else { + replica = false; + break; + } + } + if (replica) { + kspecial_convert.insert(operand_num); + } + } + return kspecial_convert; +} + +// Restructuring the conditional instruction as follows: +// i.e., %result = conditional() becomes +// x = conditional() +// y.{0..n} = gte(x, {0..n}) +// z = tuple(y.0, y.1, ...y.n) +// Doing so ensures that we can accommodate the possible shape-change of the +// conditional when the instructions are hoisted. 
+Status RestructureConditionalInstruction(HloComputation* computation, + HloInstruction* conditional) { + HloInstruction* old_root = computation->root_instruction(); + std::vector new_operands; + int cur_index = 0; + for (; cur_index < ShapeUtil::TupleElementCount(conditional->shape()); + ++cur_index) { + new_operands.push_back( + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + ShapeUtil::GetTupleElementShape(conditional->shape(), cur_index), + conditional, cur_index))); + } + HloInstruction* new_tuple = + computation->AddInstruction(HloInstruction::CreateTuple(new_operands)); + if (old_root == conditional) { + computation->set_root_instruction(new_tuple); + } else { + std::vector new_tuple_users; + for (auto conditional_user : conditional->users()) { + auto is_new_gte = absl::c_find_if( + new_operands, + [&](HloInstruction* instr) { return instr == conditional_user; }); + if (is_new_gte == new_operands.end()) { + new_tuple_users.push_back(conditional_user); + } + } + for (auto new_tuple_user : new_tuple_users) { + TF_RETURN_IF_ERROR( + conditional->ReplaceUseWith(new_tuple_user, new_tuple)); + } + } + VLOG(2) << "computation after root restructure:\n" << computation->ToString(); + return Status::OK(); +} + +StatusOr ConvertSpecialMove(HloInstruction* conditional, + bool is_layout_sensitive) { + int branch_count = conditional->branch_count(); + if (branch_count <= 0) { + return false; + } + + HloInstruction* old_root = + conditional->branch_computation(0)->root_instruction(); + if (old_root->opcode() != HloOpcode::kTuple) { + return false; + } else { + VLOG(2) << "BEFORE :" << conditional->parent()->parent()->ToString(); + // Identify the gte using `index'. + auto find_gte = [](const HloInstruction* conditional_result, + int64 index) -> HloInstruction* { + for (HloInstruction* instr : conditional_result->users()) { + if (instr->opcode() != HloOpcode::kGetTupleElement) { + return nullptr; + } + if (instr->tuple_index() == index) { + return instr; + } + } + return nullptr; + }; + + // Captures tuple indices refering to converts to be rematerialized/hoisted. + absl::flat_hash_set kspecial_convert = FindSpecialConverts( + old_root, branch_count, conditional, is_layout_sensitive); + + // Exit if we cannot find any converts to be hoisted. + if (kspecial_convert.empty()) { + return false; + } + + TF_RETURN_IF_ERROR( + RestructureConditionalInstruction(conditional->parent(), conditional)); + + for (int branch = 0; branch < branch_count; branch++) { + old_root = conditional->branch_computation(branch)->root_instruction(); + absl::flat_hash_map map_inst_to_tuple_index; + std::vector new_operands(old_root->operand_count()); + std::unordered_set to_hoist_set; + + for (int64 operand_num = 0; operand_num < old_root->operand_count(); + ++operand_num) { + map_inst_to_tuple_index[old_root->mutable_operand(operand_num)] = + operand_num; + } + for (int64 operand_num = 0; operand_num < old_root->operand_count(); + ++operand_num) { + HloInstruction* hoist = old_root->mutable_operand(operand_num); + if (!kspecial_convert.contains(operand_num)) { + new_operands[operand_num] = old_root->mutable_operand(operand_num); + continue; + } + + to_hoist_set.insert(hoist); + int64 new_tuple_count = old_root->operand_count(); + + // Replace the hoisted instr in the tuple with the operand/operands. + // We will replace at least one of the operands of the hoist at the + // tuple place; the rest will be added at the end. 
+ bool inplace = true; + CHECK(!hoist->operands().empty()); + for (HloInstruction* prod : hoist->operands()) { + if (inplace) { + map_inst_to_tuple_index[prod] = map_inst_to_tuple_index[hoist]; + new_operands[map_inst_to_tuple_index[hoist]] = prod; + inplace = false; + } else { + map_inst_to_tuple_index[prod] = new_tuple_count++; + new_operands.push_back(prod); + } + } + } + + // Create the new root instruction. + HloComputation* cur_branch = conditional->branch_computation(branch); + HloInstruction* new_branch_root = + cur_branch->AddInstruction(HloInstruction::CreateTuple(new_operands)); + // The shape can vary since the operands to convert are now + // being returned through the branches' root. + cur_branch->set_root_instruction(new_branch_root, true /*new shape*/); + TF_CHECK_OK(cur_branch->RemoveInstruction(old_root)); + + // Only one of the branches needs to change the conditional->parent(). + if (branch != 0) { + continue; + } + HloComputation* conditional_parent = conditional->parent(); + HloInstruction* newconditional = + conditional_parent->AddInstruction(HloInstruction::CreateConditional( + cur_branch->root_instruction()->shape(), + conditional->mutable_operand(0), + absl::MakeSpan(conditional->branch_computations()), + absl::MakeSpan(conditional->operands()).subspan(1))); + // Ensure that all the users of conditional refer to the new one. + TF_RETURN_IF_ERROR( + conditional->ReplaceAllUsesWithDifferentShape(newconditional)); + TF_CHECK_OK(conditional_parent->RemoveInstruction(conditional)); + conditional = newconditional; + // Add the hoisted instructions in the parent. + for (HloInstruction* hoist : to_hoist_set) { + VLOG(2) << "Hoisting instruction:" << hoist->ToString(); + int64 hoist_index = map_inst_to_tuple_index[hoist]; + // Find out the gte that captured the hoisted instr result. + HloInstruction* gte_hoist = find_gte(conditional, hoist_index); + CHECK(gte_hoist != nullptr); + std::vector new_operands; + for (HloInstruction* op : hoist->operands()) { + HloInstruction* gte = conditional_parent->AddInstruction( + HloInstruction::CreateGetTupleElement( + op->shape(), conditional, map_inst_to_tuple_index[op])); + new_operands.push_back(gte); + } + HloInstruction* hoisted = conditional_parent->AddInstruction( + hoist->CloneWithNewOperands(hoist->shape(), new_operands)); + VLOG(2) << "Hoisted instruction in parent:" << hoisted->ToString(); + TF_RETURN_IF_ERROR(gte_hoist->ReplaceAllUsesWith(hoisted)); + TF_CHECK_OK(conditional_parent->RemoveInstruction(gte_hoist)); + } + // No need to explicitly delete a hoisted instruction since if its dead + // then the subsequent DCE will remove it. + } + } + VLOG(2) << "AFTER :" << conditional->parent()->parent()->ToString(); + return true; +} + // Hoist identical ops out of the conditional. The definition of identical // are the shape of the operands are identical and their properties are // identical. Will start from the root instruction of each branch and get // the identical ops to hoist. 
StatusOr MergeIdenticalElements(HloInstruction* conditional, bool is_layout_sensitive) { + VLOG(1) << " visiting conditional:" << conditional->ToString(); int branch_count = conditional->branch_count(); if (branch_count <= 0) { return false; @@ -399,7 +616,7 @@ StatusOr MergeIdenticalElements(HloInstruction* conditional, } } - if (visitors[0].HoistInstructionSize() <= 1) { + if (visitors[0].HoistInstructionSize() < 1) { return false; } @@ -442,7 +659,6 @@ StatusOr MergeIdenticalElements(HloInstruction* conditional, RemoveInstructionFromComputation(visitors[i].instructions_to_hoist(), conditional->branch_computation(i))); } - return true; } @@ -451,26 +667,55 @@ StatusOr MergeIdenticalElements(HloInstruction* conditional, StatusOr ConditionalCodeMotion::Run(HloModule* module) { bool changed = false; - // Gather all the conditional ops in our module. We do this ahead of time so - // we don't have to worry about mutating the lists of computations or - // instructions as we iterate. - std::vector conditional_ops; - for (auto* comp : module->MakeComputationPostOrder()) { - for (auto* instr : comp->MakeInstructionPostOrder()) { - if (instr->opcode() == HloOpcode::kConditional) { - conditional_ops.push_back(instr); + if (pursue_full_conditional_code_motion_) { + std::vector conditional_ops; + for (auto* comp : module->MakeComputationPostOrder()) { + for (auto* instr : comp->MakeInstructionPostOrder()) { + if (instr->opcode() == HloOpcode::kConditional) { + conditional_ops.push_back(instr); + } } } + + for (HloInstruction* conditional_op : conditional_ops) { + TF_ASSIGN_OR_RETURN( + bool result, + MergeIdenticalElements(conditional_op, is_layout_sensitive_)); + changed |= result; + } + + if (changed) { + HloPassPipeline subpipeline("after_conditional_code_motion"); + subpipeline.AddPass(); + subpipeline.AddPass(); + subpipeline.AddPass(); + TF_ASSIGN_OR_RETURN(bool cleanup_changed, subpipeline.Run(module)); + changed |= cleanup_changed; + } } - for (HloInstruction* conditional_op : conditional_ops) { - TF_ASSIGN_OR_RETURN(bool result, MergeIdenticalElements( - conditional_op, is_layout_sensitive_)); - changed |= result; + // handling convert rematerialization/hoisting + { + std::vector conditional_ops; + for (auto* comp : module->MakeComputationPostOrder()) { + for (auto* instr : comp->MakeInstructionPostOrder()) { + if (instr->opcode() == HloOpcode::kConditional) { + conditional_ops.push_back(instr); + } + } + } + for (HloInstruction* conditional_op : conditional_ops) { + TF_ASSIGN_OR_RETURN( + bool convert_result, + ConvertSpecialMove(conditional_op, is_layout_sensitive_)); + changed |= convert_result; + } } if (changed) { - HloPassPipeline subpipeline("after_conditional_code_motion"); + HloPassPipeline subpipeline( + "after_conditional_code_motion_after_convert_hoisting"); + subpipeline.AddPass(); subpipeline.AddPass(); subpipeline.AddPass(); TF_ASSIGN_OR_RETURN(bool cleanup_changed, subpipeline.Run(module)); diff --git a/tensorflow/compiler/xla/service/conditional_code_motion.h b/tensorflow/compiler/xla/service/conditional_code_motion.h index 1197a8b3620..95f02833e15 100644 --- a/tensorflow/compiler/xla/service/conditional_code_motion.h +++ b/tensorflow/compiler/xla/service/conditional_code_motion.h @@ -23,7 +23,11 @@ limitations under the License. namespace xla { -// HLO pass that moves identical ops out of conditional. +// ConditionalCodeMotion specializes in hoisting/rematerializing +// unconditional converts in the default mode. 
+// When pursue_full_conditional_code_motion_ is set to true, the +// full HLO pass moves identical ops out of a conditional in addition to moving +// converts. // - The definition of identical are the shape of the operands are identical // and their properties are identical. // - Currently, only some types of instructions is supported. @@ -35,13 +39,18 @@ class ConditionalCodeMotion : public HloModulePass { public: // If is_layout_sensitive is true, then the hoist process preserves layout // during identical comparison. Otherwise, layout is ignored. - explicit ConditionalCodeMotion(bool is_layout_sensitive = true) - : is_layout_sensitive_(is_layout_sensitive) {} + explicit ConditionalCodeMotion( + bool is_layout_sensitive = true, + bool pursue_full_conditional_code_motion = false) + : is_layout_sensitive_(is_layout_sensitive), + pursue_full_conditional_code_motion_( + pursue_full_conditional_code_motion) {} absl::string_view name() const override { return "conditional-code-motion"; } StatusOr Run(HloModule* module) override; private: const bool is_layout_sensitive_; + const bool pursue_full_conditional_code_motion_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/conditional_code_motion_test.cc b/tensorflow/compiler/xla/service/conditional_code_motion_test.cc index 4a52303a42a..38b2b515fa0 100644 --- a/tensorflow/compiler/xla/service/conditional_code_motion_test.cc +++ b/tensorflow/compiler/xla/service/conditional_code_motion_test.cc @@ -38,7 +38,86 @@ namespace { using ConditionalCodeMotionTest = HloTestBase; namespace op = xla::testing::opcode_matchers; -TEST_F(ConditionalCodeMotionTest, DoNotMoveConvertOut) { +TEST_F(ConditionalCodeMotionTest, MoveSubsetTupleOut) { + absl::string_view hlo_string = + R"( +HloModule RemoveDotOpOut + +on_true { + %arg_tuple.1 = (f32[93184,4]{1,0}) parameter(0) + %get-tuple-element.1 = f32[93184,4]{1,0} get-tuple-element(%arg_tuple.1), index=0 + %reshape.8493 = f32[2,512,364]{2,1,0} reshape(f32[93184,4]{1,0} %get-tuple-element.1) + %convert.2894 = bf16[2,512,364]{2,1,0} convert(f32[2,512,364]{2,1,0} %reshape.8493) + ROOT %tuple.1 = ( bf16[2,512,364]{2,1,0}, f32[2,512,364]{2,1,0}) tuple(%convert.2894, %reshape.8493) +} + +on_false { + %arg_tuple.2 = (f32[93184,4]{1,0}) parameter(0) + %get-tuple-element.3 = f32[93184,4]{1,0} get-tuple-element(%arg_tuple.2), index=0 + %reshape.9717 = f32[2,512,364]{2,1,0} reshape(f32[93184,4]{1,0} %get-tuple-element.3) + %add = f32[2,512,364]{2,1,0} add(f32[2,512,364]{2,1,0} %reshape.9717, f32[2,512,364]{2,1,0} %reshape.9717) + %convert.3604 = bf16[2,512,364]{2,1,0} convert(f32[2,512,364]{2,1,0} %reshape.9717), metadata={op_type="Cast" op_name="gradients/Cast_125_grad/Cast"} + ROOT %tuple.2 = (bf16[2,512,364]{2,1,0}, f32[2,512,364]{2,1,0}) tuple(%convert.3604, %add) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + arg_tuple.11 = (f32[93184,4]{1,0}) parameter(1) + arg_tuple.22 = (f32[93184,4]{1,0}) parameter(2) + conditional = (bf16[2,512,364]{2,1,0}, f32[2,512,364]{2,1,0}) conditional(pred.1, arg_tuple.11, arg_tuple.22), true_computation=on_true, false_computation=on_false + get-first-index = bf16[2,512,364]{2,1,0} get-tuple-element(conditional), index=0 + get-first-index.2 = f32[2,512,364]{2,1,0} get-tuple-element(conditional), index=1 + ROOT result = (bf16[2,512,364]{2,1,0}, f32[2,512,364]{2,1,0}) tuple(get-first-index, get-first-index.2) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + 
ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Tuple(op::Convert(), op::GetTupleElement()))); +} + +TEST_F(ConditionalCodeMotionTest, MoveConvertOutConditionalRoot) { + absl::string_view hlo_string = + R"( +HloModule RemoveDotOpOut + +on_true { + %arg_tuple.1 = (f32[93184,4]{1,0}) parameter(0) + %get-tuple-element.1 = f32[93184,4]{1,0} get-tuple-element(%arg_tuple.1), index=0 + %reshape.8493 = f32[2,512,364]{2,1,0} reshape(f32[93184,4]{1,0} %get-tuple-element.1) + %add.8493 = f32[2,512,364]{2,1,0} add(f32[2,512,364]{2,1,0} %reshape.8493, f32[2,512,364]{2,1,0} %reshape.8493) + %convert.2894 = bf16[2,512,364]{2,1,0} convert(f32[2,512,364]{2,1,0} %add.8493) + ROOT %tuple.1 = ( bf16[2,512,364]{2,1,0}) tuple(%convert.2894) +} + +on_false { + %arg_tuple.2 = (f32[93184,4]{1,0}) parameter(0) + %get-tuple-element.3 = f32[93184,4]{1,0} get-tuple-element(%arg_tuple.2), index=0 + %reshape.9717 = f32[2,512,364]{2,1,0} reshape(f32[93184,4]{1,0} %get-tuple-element.3) + %add.8493 = f32[2,512,364]{2,1,0} add(f32[2,512,364]{2,1,0} %reshape.9717, f32[2,512,364]{2,1,0} %reshape.9717) + %sub.8493 = f32[2,512,364]{2,1,0} subtract(f32[2,512,364]{2,1,0} %add.8493, f32[2,512,364]{2,1,0} %reshape.9717) + %convert.3604 = bf16[2,512,364]{2,1,0} convert(f32[2,512,364]{2,1,0} %reshape.9717), metadata={op_type="Cast" op_name="gradients/Cast_125_grad/Cast"} + ROOT %tuple.2 = (bf16[2,512,364]{2,1,0}) tuple(%convert.3604) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + arg_tuple.11 = (f32[93184,4]{1,0}) parameter(1) + arg_tuple.22 = (f32[93184,4]{1,0}) parameter(2) + ROOT conditional = (bf16[2,512,364]{2,1,0}) conditional(pred.1, arg_tuple.11, arg_tuple.22), true_computation=on_true, false_computation=on_false +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Tuple(op::Convert()))); +} + +TEST_F(ConditionalCodeMotionTest, MoveConvertOut) { absl::string_view hlo_string = R"( HloModule RemoveDotOpOut @@ -65,12 +144,16 @@ ENTRY main { arg_tuple.22 = (f32[93184,4]{1,0}) parameter(2) conditional = (bf16[2,512,364]{2,1,0}) conditional(pred.1, arg_tuple.11, arg_tuple.22), true_computation=on_true, false_computation=on_false get-first-index = bf16[2,512,364]{2,1,0} get-tuple-element(conditional), index=0 - ROOT result = (bf16[2,512,364]{2,1,0}) tuple(get-first-index) + add.1 = bf16[2,512,364]{2,1,0} add(bf16[2,512,364]{2,1,0} get-first-index, bf16[2,512,364]{2,1,0} get-first-index) + ROOT result = (bf16[2,512,364]{2,1,0}) tuple(add.1) } )"; auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); - ConditionalCodeMotion pass; - ASSERT_FALSE(pass.Run(&*module).ValueOrDie()); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Tuple(op::Add(op::Convert(), op::Convert())))); } TEST_F(ConditionalCodeMotionTest, UserShareOperandCannotBeMoved) { @@ -123,7 +206,7 @@ ENTRY main { } )"; auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); - ConditionalCodeMotion pass; + ConditionalCodeMotion pass(true, true); ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); const HloInstruction* conditional = @@ -181,7 +264,7 @@ ENTRY main { } )"; auto module = 
ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); - ConditionalCodeMotion pass; + ConditionalCodeMotion pass(true, true); ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); const HloInstruction* conditional = FindInstruction(module.get(), "conditional"); @@ -245,7 +328,7 @@ ENTRY main { } )"; auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); - ConditionalCodeMotion pass; + ConditionalCodeMotion pass(true, true); ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); const HloInstruction* conditional = @@ -317,7 +400,7 @@ ENTRY main { )"; auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); - ConditionalCodeMotion pass; + ConditionalCodeMotion pass(true, true); ASSERT_FALSE(pass.Run(&*module).ValueOrDie()); } @@ -390,7 +473,7 @@ ENTRY main { } )"; auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); - ConditionalCodeMotion pass; + ConditionalCodeMotion pass(true, true); ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); const HloInstruction* conditional = FindInstruction(module.get(), "conditional"); diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index ad023efae59..e12c67f2357 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -30,6 +30,15 @@ filegroup( ]), ) +cc_library( + name = "test_header_helper", + testonly = True, + hdrs = ["test_target_triple_helper.h"], + deps = [ + "//tensorflow/core:test", + ], +) + filegroup( name = "single_threaded_runtime_srcs", srcs = [ @@ -1071,6 +1080,7 @@ tf_cc_test( deps = [ ":cpu_compiler", ":cpu_transfer_manager", + ":test_header_helper", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index b2416ac2799..31b9fe1c920 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -277,12 +277,12 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( pipeline.AddPass( cost_model, /*convert_batch_groups_only=*/false); - pipeline.AddPass(); pipeline.AddPass( /*rewrite_training_op=*/true, /*rewrite_inference_op=*/true, /*rewrite_grad_op=*/true); pipeline.AddPass(); + pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass(target_machine_features); { diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index 4552d7b5ba9..d095d220b97 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -299,12 +299,11 @@ StatusOr CpuExecutable::ExecuteAsyncOnStream( const Shape& expected_shape = entry_comp->parameter_instruction(i)->shape(); const Shape& actual_shape = arguments[i].Buffers().shape(); - CHECK( - Shape::Equal().IgnoreDynamicDimension()(expected_shape, actual_shape)) - << absl::StreamFormat( - "Shape mismatch on argument %d. Expected %s, but was %s.", i, - expected_shape.ToString(/*print_layout=*/true), - actual_shape.ToString(/*print_layout=*/true)); + TF_RET_CHECK( + ShapeUtil::DynamicShapeIsCompatible(actual_shape, expected_shape)) + << "Shape mismatch on argument " << i << ", " + << expected_shape.ToString(/*print_layout=*/true) << " vs. 
" + << actual_shape.ToString(/*print_layout=*/true); } } diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc index 97e0a518499..9460cc55e10 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc @@ -94,9 +94,8 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer, // Cost condition: not fuse (simple, expensive producers) and (consumers who // reuse operand elements). - if (producer->opcode() != HloOpcode::kFusion && - consumer->ReusesOperandElements(operand_index) && - is_expensive(*producer)) { + if (producer->opcode() != HloOpcode::kFusion && is_expensive(*producer) && + consumer->ReusesOperandElements(operand_index)) { VLOG(2) << "Fusion is not profitable."; return false; } diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h index 5c9f6677ab3..4c3167e16d9 100644 --- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h @@ -50,6 +50,10 @@ class CpuElementalIrEmitter : public ElementalIrEmitter { return ir_emitter_->EmitThreadLocalCall(callee, parameters, name); } + bool fast_min_max() override { + return hlo_module_config_.debug_options().xla_cpu_enable_fast_min_max(); + } + IrEmitter* ir_emitter_; }; diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc index f62769cc615..8d9229c1223 100644 --- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc @@ -318,7 +318,9 @@ llvm::Value* GenerateVF32Log(llvm::IRBuilder<>* b, llvm::Value* input, llvm::Value* is_pos_inf_mask = vsl.FCmpEQMask(input, pos_inf); // Cut off denormalized stuff. - llvm::Value* tmp0 = vsl.Max(min_norm_pos, input); + // Always allow fast max because we are checking for the nan above. + llvm::Value* tmp0 = + vsl.Max(min_norm_pos, input, /*enable_fast_min_max=*/true); // VectorSupportLibrary (intentionally) can't juggle more than one type at a // time so drop down to IRBuilder for this bit. diff --git a/tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h b/tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h new file mode 100644 index 00000000000..857de4a8143 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h @@ -0,0 +1,28 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_TEST_TARGET_TRIPLE_HELPER_H_ +#define TENSORFLOW_TEST_TARGET_TRIPLE_HELPER_H_ + +#if (defined(__powerpc__) || \ + defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) +static const char kTargetCpuForHost[] = "ppc"; +static const char kTargetTripleForHost[] = "ppc64le-ibm-linux-gnu"; +#else +static const char kTargetCpuForHost[] = ""; +static const char kTargetTripleForHost[] = "x86_64-pc-linux"; +#endif + +#endif diff --git a/tensorflow/compiler/xla/service/cpu/tests/BUILD b/tensorflow/compiler/xla/service/cpu/tests/BUILD index 7c17b1339d1..d7c50dce3ca 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/cpu/tests/BUILD @@ -41,6 +41,7 @@ tf_cc_test( deps = [ "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", + "//tensorflow/compiler/xla/service/cpu:test_header_helper", "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", "//tensorflow/core:lib", "//tensorflow/core:test", @@ -135,6 +136,7 @@ tf_cc_test( deps = [ "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", + "//tensorflow/compiler/xla/service/cpu:test_header_helper", "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/core:lib", @@ -215,6 +217,7 @@ tf_cc_test( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", + "//tensorflow/compiler/xla/service/cpu:test_header_helper", "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", "//tensorflow/core:lib", "//tensorflow/core:test", @@ -228,6 +231,7 @@ tf_cc_test( deps = [ "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", + "//tensorflow/compiler/xla/service/cpu:test_header_helper", "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", "//tensorflow/core:lib", "//tensorflow/core:test", @@ -241,6 +245,7 @@ tf_cc_test( deps = [ "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", + "//tensorflow/compiler/xla/service/cpu:test_header_helper", "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", "//tensorflow/core:lib", "//tensorflow/core:test", diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_dyn_shape_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_dyn_shape_test.cc index 46249caa0c7..ce892ad34ae 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_dyn_shape_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_dyn_shape_test.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" +#include "tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h" #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" namespace xla { @@ -46,7 +47,8 @@ TEST_F(CpuDynamicShapeTest, DynamicShapeR2) { )"; CpuAotCompilationOptions options{ - /*triple=*/"x86_64", /*cpu_name=*/"", /*features=*/"", + /*triple=*/kTargetTripleForHost, /*cpu_name=*/kTargetCpuForHost, + /*features=*/"", /*entry_point_name=*/"entry", /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc index 8b7f843582b..b233ee7df81 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc @@ -20,6 +20,7 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" +#include "tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h" #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/tests/test_utils.h" @@ -45,7 +46,8 @@ class CpuEigenDotOperationTest void CompileAndCheck(std::unique_ptr entry_computation, const string& filecheck_lines) { CpuAotCompilationOptions options{ - /*triple=*/"x86_64", /*cpu_name=*/"", /*features=*/"", + /*triple=*/kTargetTripleForHost, /*cpu_name=*/kTargetCpuForHost, + /*features=*/"", /*entry_point_name=*/"entry", /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_key_value_sort_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_key_value_sort_test.cc index f3b7b91b2b5..b897f7a1522 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_key_value_sort_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_key_value_sort_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" +#include "tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h" #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" namespace xla { @@ -48,7 +49,8 @@ CHECK: call void @__xla_cpu_runtime_KeyValueSort TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_text)); CpuAotCompilationOptions options{ - /*triple=*/"x86_64", /*cpu_name=*/"", /*features=*/"", + /*triple=*/kTargetTripleForHost, /*cpu_name=*/kTargetCpuForHost, + /*features=*/"", /*entry_point_name=*/"entry", /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc index fc670201125..fb48cfe50e2 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" +#include "tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h" #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" #include "tensorflow/compiler/xla/service/hlo_parser.h" @@ -64,7 +65,8 @@ CHECK-NOT: private unnamed_addr constant [48 x i8] ParseAndReturnVerifiedModule(hlo_text)); CpuAotCompilationOptions options{ - /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"", + /*triple=*/kTargetTripleForHost, /*cpu_name=*/kTargetCpuForHost, + /*features=*/"", /*entry_point_name=*/"entry", /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; @@ -112,7 +114,8 @@ CHECK-NOT: private unnamed_addr constant [8 x i8] ParseAndReturnVerifiedModule(hlo_text)); CpuAotCompilationOptions options{ - /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"", + /*triple=*/kTargetTripleForHost, /*cpu_name=*/kTargetCpuForHost, + /*features=*/"", /*entry_point_name=*/"entry", /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc index ad83c485998..b2ed9bd5f31 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" +#include "tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h" #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" namespace xla { @@ -46,7 +47,8 @@ CHECK: private unnamed_addr constant [48 x i8] TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_text)); CpuAotCompilationOptions options{ - /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"", + /*triple=*/kTargetTripleForHost, /*cpu_name=*/kTargetCpuForHost, + /*features=*/"", /*entry_point_name=*/"entry", /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; @@ -73,7 +75,8 @@ CHECK: Outfeed TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_text)); CpuAotCompilationOptions options{ - /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"", + /*triple=*/kTargetTripleForHost, /*cpu_name=*/kTargetCpuForHost, + /*features=*/"", /*entry_point_name=*/"entry", /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc index b15ad1e162d..0d2eab9fd42 100644 --- a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc +++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc @@ -80,10 +80,11 @@ llvm::Value* VectorSupportLibrary::Sub(llvm::Value* lhs, llvm::Value* rhs) { return b()->CreateFSub(lhs, rhs); } -llvm::Value* VectorSupportLibrary::Max(llvm::Value* lhs, llvm::Value* rhs) { +llvm::Value* VectorSupportLibrary::Max(llvm::Value* lhs, llvm::Value* rhs, + bool enable_fast_min_max) { AssertCorrectTypes({lhs, rhs}); if (scalar_type_->isFloatingPointTy()) { - return llvm_ir::EmitFloatMax(lhs, rhs, b_); + return llvm_ir::EmitFloatMax(lhs, rhs, b_, enable_fast_min_max); } else { LOG(FATAL) << "Max for integers is unimplemented"; } diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.h 
b/tensorflow/compiler/xla/service/cpu/vector_support_library.h index cbbc4d7bf34..f1a0b0a4406 100644 --- a/tensorflow/compiler/xla/service/cpu/vector_support_library.h +++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.h @@ -78,9 +78,11 @@ class VectorSupportLibrary { llvm::Value* Sub(llvm::Value* lhs, const llvm::APFloat& rhs) { return Sub(lhs, GetConstantFloat(lhs->getType(), rhs)); } - llvm::Value* Max(llvm::Value* lhs, llvm::Value* rhs); - llvm::Value* Max(const llvm::APFloat& lhs, llvm::Value* rhs) { - return Max(GetConstantFloat(rhs->getType(), lhs), rhs); + llvm::Value* Max(llvm::Value* lhs, llvm::Value* rhs, + bool enable_fast_min_max); + llvm::Value* Max(const llvm::APFloat& lhs, llvm::Value* rhs, + bool enable_fast_min_max) { + return Max(GetConstantFloat(rhs->getType(), lhs), rhs, enable_fast_min_max); } llvm::Value* Div(llvm::Value* lhs, llvm::Value* rhs); diff --git a/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc b/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc index 754885d8744..bf38450a386 100644 --- a/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc +++ b/tensorflow/compiler/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetMachine.h" #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" +#include "tensorflow/compiler/xla/service/cpu/test_target_triple_helper.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" @@ -74,8 +75,9 @@ ENTRY main { module_group->push_back(std::move(hlo_module)); // Check that the GetTargetVectorRegisterByteSize is itself working. 
- TF_ASSERT_OK_AND_ASSIGN(unsigned vector_register_byte_size_for_x86_64, - GetTargetVectorRegisterByteSize("x86_64-pc-linux")); + TF_ASSERT_OK_AND_ASSIGN( + unsigned vector_register_byte_size_for_x86_64, + GetTargetVectorRegisterByteSize(kTargetTripleForHost)); ASSERT_EQ(vector_register_byte_size_for_x86_64, 16); std::string triple = "i686-none-android"; diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index e4097b0c06f..4b6c30cadc4 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -1313,12 +1313,12 @@ StatusOr ElementalIrEmitter::EmitComplexBinaryOp( llvm::Value* ElementalIrEmitter::EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value) { - return llvm_ir::EmitFloatMax(lhs_value, rhs_value, b_); + return llvm_ir::EmitFloatMax(lhs_value, rhs_value, b_, fast_min_max()); } llvm::Value* ElementalIrEmitter::EmitFloatMin(llvm::Value* lhs_value, llvm::Value* rhs_value) { - return llvm_ir::EmitFloatMin(lhs_value, rhs_value, b_); + return llvm_ir::EmitFloatMin(lhs_value, rhs_value, b_, fast_min_max()); } StatusOr ElementalIrEmitter::EmitLog(PrimitiveType prim_type, diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h index e39d2dd99ec..365e3f56b85 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h @@ -245,6 +245,8 @@ class ElementalIrEmitter : public IrBuilderMixin { std::vector initial_value_generators, const llvm_ir::IrArray::Index& index); + virtual bool fast_min_max() = 0; + llvm::IRBuilder<>* const b_; llvm::Module* module_; diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc index ebf7cc440dd..61ce6200a28 100644 --- a/tensorflow/compiler/xla/service/executable.cc +++ b/tensorflow/compiler/xla/service/executable.cc @@ -28,10 +28,57 @@ limitations under the License. #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/proto_serialization.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/stream_executor/device_description.h" namespace xla { +ExecutionInput::~ExecutionInput() { + for (auto& index : unowned_indices_) { + auto buffer = buffers_.mutable_element(index)->Release(); + if (buffer) { + buffer->Release(); + } + } +} + +Status ExecutionInput::SetDynamicShape(Shape dynamic_shape) { + const Shape& input_shape = shape(); + if (!ShapeUtil::DynamicShapeIsCompatible(input_shape, dynamic_shape)) { + return tensorflow::errors::InvalidArgument( + "Cannot set dynamic shape: ", input_shape.DebugString(), " vs. 
", + dynamic_shape.DebugString()); + } + dynamic_shape_ = absl::make_unique(std::move(dynamic_shape)); + return Status::OK(); +} + +void ExecutionInput::SetUnownedBuffer(const ShapeIndex& index, + MaybeOwningDeviceMemory buffer) { + *buffers_.mutable_element(index) = std::move(buffer); + unowned_indices_.insert(index); +} + +xla::StatusOr ExecutionInput::ToShapedBuffer( + se::DeviceMemoryAllocator* allocator, int device_ordinal) const { + const Shape& input_shape = shape(); + xla::ShapedBuffer shaped_buffer(input_shape, input_shape, + allocator->platform(), device_ordinal); + for (const auto& index_buffer : Buffers()) { + const tensorflow::se::OwningDeviceMemory* mem = + index_buffer.second.AsOwningDeviceMemory(); + if (mem != nullptr && (mem->allocator() != allocator || + mem->device_ordinal() != device_ordinal)) { + return tensorflow::errors::InvalidArgument( + "Device buffer at index ", index_buffer.first.ToString(), + " has mismatching allocator/device"); + } + shaped_buffer.set_buffer(index_buffer.second.AsDeviceMemoryBase(), + index_buffer.first); + } + return std::move(shaped_buffer); +} + StatusOr Executable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, absl::Span arguments, diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index 2c979662d24..6881f6dd68a 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_SERVICE_EXECUTABLE_H_ #include +#include #include #include @@ -65,31 +66,32 @@ class ExecutionInput { : buffers_(std::move(buffers)) {} ExecutionInput(ExecutionInput&&) = default; - ~ExecutionInput() { - for (auto& index : unowned_indices_) { - auto buffer = buffers_.mutable_element(index)->Release(); - if (buffer) { - buffer->Release(); - } - } - } + ~ExecutionInput(); ExecutionInput& operator=(ExecutionInput&&) = default; - const Shape& shape() const { return buffers_.shape(); } + const Shape& shape() const { + return dynamic_shape_ != nullptr ? *dynamic_shape_ : buffers_.shape(); + } + + Status SetDynamicShape(Shape dynamic_shape); + + xla::StatusOr ToShapedBuffer( + se::DeviceMemoryAllocator* allocator, int device_ordinal) const; void SetBuffer(const ShapeIndex& index, MaybeOwningDeviceMemory buffer) { *buffers_.mutable_element(index) = std::move(buffer); } void SetUnownedBuffer(const ShapeIndex& index, - MaybeOwningDeviceMemory buffer) { - *buffers_.mutable_element(index) = std::move(buffer); - unowned_indices_.push_back(index); - } + MaybeOwningDeviceMemory buffer); void SetUnownedIndex(const ShapeIndex& index) { - unowned_indices_.push_back(index); + unowned_indices_.insert(index); + } + + void ClearUnownedIndex(const ShapeIndex& index) { + unowned_indices_.erase(index); } const ShapeTree& Buffers() const { return buffers_; } @@ -106,9 +108,10 @@ class ExecutionInput { private: ShapeTree buffers_; - // (Unordered) set of indices of buffers that should be returned to the - // caller if an error occurs when enqueuing the computation. - std::vector unowned_indices_; + // Set of indices of buffers that should be returned to the caller if an error + // occurs when enqueuing the computation. 
+ std::set unowned_indices_; + std::unique_ptr dynamic_shape_; }; // ExecutionOutput encapsulates the output buffers of a execution and the @@ -145,7 +148,6 @@ class ExecutionOutput { to_be_released_.push_back(std::move(mem)); } - // Should be called once it is known that the execute operation succeeded, // before returning the ExecutionOutput to the caller. ExecutionOutput& Commit() { diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 0eb82128159..472d2117a2c 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -1174,6 +1174,7 @@ cc_library( ":reduction_degenerate_dim_remover", ":reduction_dimension_grouper", ":reduction_layout_normalizer", + ":reduction_splitter", ":stream_assignment", ":stream_executor_util", ":target_constants", @@ -1819,6 +1820,33 @@ cc_library( ], ) +cc_library( + name = "reduction_splitter", + srcs = ["reduction_splitter.cc"], + hdrs = ["reduction_splitter.h"], + deps = [ + ":ir_emission_utils", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_pass", + ], +) + +tf_cc_test( + name = "reduction_splitter_test", + srcs = ["reduction_splitter_test.cc"], + deps = [ + ":reduction_splitter", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:test_helpers", + "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) + cc_library( name = "reduction_layout_normalizer", srcs = ["reduction_layout_normalizer.cc"], diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h index a3056b1ddad..766a4c84df5 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h @@ -96,6 +96,10 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { llvm::Value* EmitThreadId() override; + bool fast_min_max() override { + return hlo_module_config_.debug_options().xla_gpu_enable_fast_min_max(); + } + private: // Emits IR for op, which must have opcode kPower. StatusOr EmitPowerOp(const HloInstruction* op, diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index cddbee92874..156cb112285 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -65,6 +65,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/reduction_degenerate_dim_remover.h" #include "tensorflow/compiler/xla/service/gpu/reduction_dimension_grouper.h" #include "tensorflow/compiler/xla/service/gpu/reduction_layout_normalizer.h" +#include "tensorflow/compiler/xla/service/gpu/reduction_splitter.h" #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" #include "tensorflow/compiler/xla/service/gpu/target_constants.h" @@ -371,6 +372,7 @@ Status GpuCompiler::OptimizeHloPostLayoutAssignment( pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass(); + pipeline.AddPass>(); // The LayoutAssignment pass may leave behind kCopy instructions which are // duplicate or NOPs, so remove them with algebraic simplification and CSE. 
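The reduction-splitter pass registered in the post-layout-assignment pipeline above rewrites a reduce over non-contiguous dimensions into two consecutive reduces, peeling off the reduce dimension with the largest input extent first, provided that dimension has at least 128 elements (see reduction_splitter.cc and reduction_splitter_test.cc later in this patch). A minimal HLO sketch of the rewrite, adapted from the SplitReductionAtDimensionTwo test; the instruction names and the omitted layouts are illustrative only:

// Before: one reduce over the non-contiguous dimensions {0,2} of f32[6,16,512,64].
reduce = f32[16,64] reduce(input, zero), dimensions={0,2}, to_apply=add_computation

// After: first reduce the largest dimension (2, of size 512), then the remaining dimension 0.
pre_reduce = f32[6,16,64] reduce(input, zero), dimensions={2}, to_apply=add_computation
reduce = f32[16,64] reduce(pre_reduce, zero), dimensions={0}, to_apply=add_computation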
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc index b97aa3651c6..01bcf456f75 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc @@ -226,6 +226,11 @@ bool IsReductionFromOrToContiguousDimensions(const HloInstruction& reduce) { dims_to_keep.push_back(dim); } } + + // We support fast codegen for three cases: + // 1) Row reduction: (K, R) + // 2) Column reduction: (K, R, K) + // 3) "Batched" row reduction: (R, K, R) if (!LayoutUtil::AreDimensionsConsecutive(input->shape().layout(), dims_to_keep) && !LayoutUtil::AreDimensionsConsecutive(input->shape().layout(), diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 937a0ea5bbc..74aad5f5bd5 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1418,6 +1418,13 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { AddThunkToThunkSequence( absl::make_unique(std::move(thunks), sort)); + if (sort->operand_count() > 1) { + // Emit the tuple as part of the last stage of sorting. + // We are currently in the block sorted.in_bounds.after. + b_.SetInsertPoint(b_.GetInsertBlock()->getTerminator()); + llvm_ir::EmitTuple(GetIrArray(*sort, *sort), + ConstructIrArrayForOutputs(*sort), &b_); + } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h index 88351881f3a..25acabb239b 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h @@ -77,8 +77,6 @@ class KernelThunk : public Thunk { // Will be set by IrEmitterUnnested. LaunchDimensions launch_dimensions_; - // Describes how to load this kernel. ExecuteOnStream reuses this loader - // specification for all executions. mutable tensorflow::mutex mutex_; // Loaded kernels for each `StreamExecutor`. Requires pointer stability of diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc index 497dcda4361..d2126a8d17d 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc @@ -492,9 +492,10 @@ void NVPTXBackendInit(const HloModuleConfig& hlo_module_config) { namespace nvptx { -StatusOr CompileToPtx(llvm::Module* module, GpuVersion gpu_version, - const HloModuleConfig& hlo_module_config, - const string& libdevice_dir_path) { +StatusOr CompileToPtx( + llvm::Module* module, GpuVersion gpu_version, + const HloModuleConfig& hlo_module_config, const string& libdevice_dir_path, + std::function configure_target) { static absl::once_flag backend_init_flag; absl::call_once(backend_init_flag, NVPTXBackendInit, hlo_module_config); @@ -525,6 +526,11 @@ StatusOr CompileToPtx(llvm::Module* module, GpuVersion gpu_version, std::unique_ptr target_machine = NVPTXGetTargetMachine( default_target_triple, *compute_capability, hlo_module_config); + // Apply target machine configuration from call-back if available. + if (configure_target) { + configure_target(target_machine.get()); + } + // Link with libdevice, and optimize the LLVM module. 
TF_RETURN_IF_ERROR(LinkAndOptimizeModule( module, gpu_version, hlo_module_config, libdevice_dir_path, diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h index 526621de7a5..33ef9280c7a 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h @@ -22,6 +22,7 @@ limitations under the License. #include "absl/strings/string_view.h" #include "llvm/IR/Module.h" +#include "llvm/Target/TargetMachine.h" #include "tensorflow/compiler/xla/service/gpu/gpu_types.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/statusor.h" @@ -38,9 +39,10 @@ namespace nvptx { // The Compile.* interfaces each create their own llvm::LLVMContext objects for // thread safety, but note that LLVM's multithreaded support is very // preliminary; multithreaded use is not recommended at this time. -StatusOr CompileToPtx(llvm::Module* module, GpuVersion gpu_version, - const HloModuleConfig& hlo_module_config, - const string& libdevice_dir_path); +StatusOr CompileToPtx( + llvm::Module* module, GpuVersion gpu_version, + const HloModuleConfig& hlo_module_config, const string& libdevice_dir_path, + std::function configure_target = nullptr); } // namespace nvptx namespace amdgpu { diff --git a/tensorflow/compiler/xla/service/gpu/reduction_splitter.cc b/tensorflow/compiler/xla/service/gpu/reduction_splitter.cc new file mode 100644 index 00000000000..b68213ec35f --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/reduction_splitter.cc @@ -0,0 +1,117 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/reduction_splitter.h" + +#include + +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/shape_util.h" + +namespace xla { +namespace gpu { + +class ReductionSplitterVisitor : public DfsHloRewriteVisitor { + public: + Status HandleReduce(HloInstruction *reduce) override { + VLOG(4) << "Input: " << reduce->ToString(); + + // Reductions with contiguous dimensions are lowered to efficient code. No + // need to split such ops. + if (IsReductionFromOrToContiguousDimensions(*reduce)) { + return Status::OK(); + } + if (reduce->dimensions().size() < 2) { + return Status::OK(); + } + if (!reduce->shape().IsArray()) { + // TODO(cheshire): Handle variadic reduction. 
+ return Status::OK(); + } + + HloInstruction *operand = reduce->mutable_operand(0); + const Shape &shape = operand->shape(); + CHECK(shape == LayoutUtil::GetWithDefaultLayout(shape)) + << "Default layout should be enforced on reduction operand"; + // Verify that contiguous dimensions have been grouped by the + // ReductionDimensionGrouper pass. + for (int64 i = 0; i < reduce->dimensions().size(); ++i) { + for (int64 j = i + 1; j < reduce->dimensions().size(); ++j) { + CHECK(abs(reduce->dimensions(i) - reduce->dimensions(j)) > 1) + << "Reduction dimensions must not be consecutive"; + } + } + + // The reduce op has non-contiguous dimensions. Look for the dimension with + // the largest shape dimension. Reducing along this dimension first will + // reduce the output size most effectively. + int64 max_shape_dim = 0; + int64 max_reduce_dim = 0; + const auto &input_shape = reduce->operand(0)->shape(); + for (int64 i = 0; i < reduce->dimensions().size(); ++i) { + if (input_shape.dimensions(reduce->dimensions(i)) > max_shape_dim) { + max_reduce_dim = reduce->dimensions(i); + max_shape_dim = input_shape.dimensions(max_reduce_dim); + } + } + // TODO(tjoerg): Run microbenchmarks to tune this threshold. + if (max_shape_dim < 128) { + return Status::OK(); + } + + // Split the reduction into a pre-reduction and a final reduction. + VLOG(3) << "Splitting reduction " << reduce->name() << " at dimension " + << max_reduce_dim; + std::vector pre_reduce_dims; + pre_reduce_dims.push_back(max_reduce_dim); + std::vector pre_reduce_shape_dims(input_shape.dimensions().begin(), + input_shape.dimensions().end()); + pre_reduce_shape_dims.erase(pre_reduce_shape_dims.begin() + max_reduce_dim); + Shape pre_reduce_shape = ShapeUtil::MakeShape( + reduce->shape().element_type(), pre_reduce_shape_dims); + std::unique_ptr pre_reduce = HloInstruction::CreateReduce( + pre_reduce_shape, reduce->mutable_operand(0), + reduce->mutable_operand(1), pre_reduce_dims, reduce->to_apply()); + pre_reduce->set_metadata(reduce->metadata()); + + std::vector final_reduce_dims(reduce->dimensions().begin(), + reduce->dimensions().end()); + final_reduce_dims.erase( + std::remove(final_reduce_dims.begin(), final_reduce_dims.end(), + max_reduce_dim), + final_reduce_dims.end()); + for (int64 i = 0; i < final_reduce_dims.size(); ++i) { + if (final_reduce_dims[i] > max_reduce_dim) { + final_reduce_dims[i]--; + } + } + std::unique_ptr final_reduce = HloInstruction::CreateReduce( + reduce->shape(), + reduce->parent()->AddInstruction(std::move(pre_reduce)), + reduce->mutable_operand(1), final_reduce_dims, reduce->to_apply()); + return ReplaceWithNewInstruction(reduce, std::move(final_reduce)); + } +}; + +StatusOr ReductionSplitter::Run(HloModule *module) { + TF_ASSIGN_OR_RETURN(bool changed, + ReductionSplitterVisitor().RunOnModule(module)); + return changed; +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/reduction_splitter.h b/tensorflow/compiler/xla/service/gpu/reduction_splitter.h new file mode 100644 index 00000000000..f161b579eb8 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/reduction_splitter.h @@ -0,0 +1,49 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_REDUCTION_SPLITTER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_REDUCTION_SPLITTER_H_ + +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { +namespace gpu { + +// Splits a reduce op into two consecutive reduce ops if +// * the reduce dimensions are not contiguous and +// * at least one reduce dimension is large (i.e. corresponds to a large input +// shape dimension). +// +// Reductions with non-contiguous dimensions are emitted as simple element-wise +// loops. This is inefficient when reducing large input shape dimensions. +// Splitting such reductions allows using more efficient reduction emitters. +// +// This pass splits reduce ops into two consecutive reduce ops. Run it to a +// fixpoint to split reduce ops along multiple large dimensions. +// +// Precondition: ReductionDimensionGrouper has been run and adjacent reduce +// dimensions have been grouped. Reduction layouts have been normalized. + +class ReductionSplitter : public HloModulePass { + public: + absl::string_view name() const override { return "reduction-splitter"; } + + StatusOr Run(HloModule* module) override; +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_REDUCTION_SPLITTER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/reduction_splitter_test.cc b/tensorflow/compiler/xla/service/gpu/reduction_splitter_test.cc new file mode 100644 index 00000000000..1be55b84204 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/reduction_splitter_test.cc @@ -0,0 +1,140 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/reduction_splitter.h" + +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" + +namespace xla { +namespace gpu { +namespace { + +namespace op = xla::testing::opcode_matchers; + +class ReductionSplitterTest : public HloTestBase {}; + +TEST_F(ReductionSplitterTest, SplitReductionAtDimensionTwo) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule test + + add_computation { + x = f32[] parameter(0) + y = f32[] parameter(1) + ROOT add = f32[] add(x, y) + } + + ENTRY entry_computation { + param_0 = f16[6,16,512,64]{3,2,1,0} parameter(0) + transpose.1781 = f16[6,512,16,64]{3,1,2,0} transpose(param_0), dimensions={0,2,1,3} + convert.6986 = f32[6,512,16,64]{3,1,2,0} convert(transpose.1781) + bitcast.2136 = f32[6,16,512,64]{3,2,1,0} bitcast(convert.6986) + constant_11111 = f32[] constant(0) + ROOT reduce.982 = f32[16,64]{1,0} reduce(bitcast.2136, constant_11111), dimensions={0,2}, to_apply=add_computation + } + )") + .ValueOrDie(); + ASSERT_TRUE(ReductionSplitter().Run(module.get()).ValueOrDie()); + SCOPED_TRACE(module->ToString()); + const HloInstruction* root_reduction = + module->entry_computation()->root_instruction(); + ASSERT_THAT(root_reduction, op::Reduce(op::Reduce(), op::Constant())); + + auto* pre_reduction = root_reduction->operand(0); + EXPECT_THAT(pre_reduction->dimensions(), std::vector({2})); + EXPECT_THAT(pre_reduction->shape(), ShapeUtil::MakeShape(F32, {6, 16, 64})); + EXPECT_THAT(root_reduction->dimensions(), std::vector({0})); + EXPECT_THAT(root_reduction->shape(), ShapeUtil::MakeShape(F32, {16, 64})); +} + +TEST_F(ReductionSplitterTest, SplitReductionAtDimensionZero) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule test + + add_computation { + x = f32[] parameter(0) + y = f32[] parameter(1) + ROOT add = f32[] add(x, y) + } + + ENTRY entry_computation { + param_0 = f32[1024,16,512,64,128]{4,3,2,1,0} parameter(0) + constant_11111 = f32[] constant(0) + ROOT reduce.982 = f32[16,64]{1,0} reduce(param_0, constant_11111), dimensions={2,0,4}, to_apply=add_computation + } + )") + .ValueOrDie(); + ASSERT_TRUE(ReductionSplitter().Run(module.get()).ValueOrDie()); + SCOPED_TRACE(module->ToString()); + const HloInstruction* root_reduction = + module->entry_computation()->root_instruction(); + ASSERT_THAT(root_reduction, op::Reduce(op::Reduce(), op::Constant())); + + auto* pre_reduction = root_reduction->operand(0); + EXPECT_THAT(pre_reduction->dimensions(), std::vector({0})); + EXPECT_THAT(pre_reduction->shape(), + ShapeUtil::MakeShape(F32, {16, 512, 64, 128})); + EXPECT_THAT(root_reduction->dimensions(), std::vector({1, 3})); + EXPECT_THAT(root_reduction->shape(), ShapeUtil::MakeShape(F32, {16, 64})); +} + +TEST_F(ReductionSplitterTest, DontSplitReductionWithSmallDimensions) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule test + + add_computation { + x = f32[] parameter(0) + y = f32[] parameter(1) + ROOT add = f32[] add(x, y) + } + + ENTRY entry_computation { + param_0 = f32[8,1024,8]{2,1,0} parameter(0) + constant_11111 = f32[] constant(0) + ROOT reduce.982 = f32[1024]{0} reduce(param_0, constant_11111), dimensions={2,0}, to_apply=add_computation + } + )") + 
.ValueOrDie(); + EXPECT_FALSE(ReductionSplitter().Run(module.get()).ValueOrDie()); +} + +TEST_F(ReductionSplitterTest, DontSplitReductionsWithContiguousDimensions) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule test + + add_computation { + x = f32[] parameter(0) + y = f32[] parameter(1) + ROOT add = f32[] add(x, y) + } + + ENTRY entry_computation { + param_0 = f32[128,128,64,128]{3,2,1,0} parameter(0) + constant_11111 = f32[] constant(0) + // The dimensions to keep (1 and 2) are contiguous. + ROOT reduce.982 = f32[128,64]{1,0} reduce(param_0, constant_11111), dimensions={3,0}, to_apply=add_computation + } + )") + .ValueOrDie(); + EXPECT_FALSE(ReductionSplitter().Run(module.get()).ValueOrDie()); +} + +} // namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc index 282f7b24a31..9b58457d129 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc @@ -92,26 +92,23 @@ TEST_F(GpuFtzDisabledTest, MultiplyFtz) { } // In NVPTX, exp(float) is implemented in libdevice, and consults __nvvm_reflect -// to determine whether or not ftz is enabled. The implementation uses two -// calls to ex2.approx. When ftz is on, we get two calls to the ftz version; -// when ftz is off, we get one call to the ftz version and one call to the -// regular version. +// to determine whether or not ftz is enabled. +// The implementation in CUDA 11 uses one ex2.approx.ftz, irrespective of ftz +// being enabled or not. In previous CUDA versions, there is a leading +// ex2.approx that does obey the ftz setting. +// Instead of pattern matching implementation details, it might be better to +// value-test the actual result instead. TODO(csigg): change to value-test.
TEST_F(GpuFtzEnabledTest, ExpFtz) { CompileAndOptionallyVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"( CHECK-NOT: ex2.approx.f32 CHECK: ex2.approx.ftz.f32 CHECK-NOT: ex2.approx.f32 - CHECK: ex2.approx.ftz.f32 - CHECK-NOT: ex2.approx.f32 - CHECK-NOT: ex2.approx.ftz.f32 )"); } TEST_F(GpuFtzDisabledTest, ExpFtz) { CompileAndOptionallyVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"( - CHECK-NOT: ex2.approx.f32 - CHECK-DAG: ex2.approx.ftz.f32 - CHECK-DAG: ex2.approx.f32 + CHECK: ex2.approx.ftz.f32 CHECK-NOT: ex2.approx.f32 CHECK-NOT: ex2.approx.ftz.f32 )"); diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_degenerate_dim_remover_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_degenerate_dim_remover_test.cc index 2c5e704d7c2..92f558ee98d 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_degenerate_dim_remover_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_degenerate_dim_remover_test.cc @@ -37,6 +37,7 @@ class ReductionDegenerateDimRemoverTest : public GpuCodegenTest { DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest(); debug_options.add_xla_disable_hlo_passes("reduction-layout-normalizer"); debug_options.add_xla_disable_hlo_passes("reduction-dimension-grouper"); + debug_options.add_xla_disable_hlo_passes("reduction-splitter"); debug_options.add_xla_disable_hlo_passes("gpu-tree-reduction-rewriter"); return debug_options; } diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_layout_normalizer_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_layout_normalizer_test.cc index d06385480e5..b65c2842320 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_layout_normalizer_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_layout_normalizer_test.cc @@ -33,6 +33,7 @@ class ReductionLayoutNormalizerTest : public GpuCodegenTest { DebugOptions GetDebugOptionsForTest() override { DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest(); debug_options.add_xla_disable_hlo_passes("reduction-dimension-grouper"); + debug_options.add_xla_disable_hlo_passes("reduction-splitter"); debug_options.add_xla_disable_hlo_passes("layout-assignment"); debug_options.add_xla_disable_hlo_passes("gpu-tree-reduction-rewriter"); return debug_options; diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc index f19882c9347..a46d20d5808 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc @@ -1007,6 +1007,8 @@ void HloDataflowAnalysis::OptimizePhiValues() { HloValue::Id phi_id = values[0]->id(); HloValue::Id new_id = phi_graph_.FindOptimizedValue(phi_id); if (new_id != phi_id) { + VLOG(1) << "Replacing " << values[0]->ToString() << " with " + << GetValue(new_id).ToString(); value_set->Clear(); const HloValue& new_value = GetValue(new_id); value_set->AddValue(&new_value); diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc index 3dd6d82784f..ae8f49df4b4 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc @@ -274,6 +274,13 @@ StatusOr HloEvaluator::Evaluate( engine_.seed(seed_); TF_RETURN_IF_ERROR(computation.Accept(this)); + + if (VLOG_IS_ON(100)) { + for (const HloInstruction* instr : computation.instructions()) { + VLOG(100) << instr->name() << " = " << GetEvaluatedLiteralFor(instr); + } + } + return 
GetEvaluatedLiteralFor(computation.root_instruction()).Clone(); } diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index cfa21b95dd2..6de76c1cc63 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -3908,6 +3908,10 @@ const string& HloInstruction::outfeed_config() const { return Cast(this)->outfeed_config(); } +void HloInstruction::set_outfeed_config(const string& config) { + return Cast(this)->set_outfeed_config(config); +} + const std::vector& HloInstruction::replica_groups() const { return Cast(this)->replica_groups(); } diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 7a5d506b681..f3bb59ff625 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -1755,6 +1755,9 @@ class HloInstruction { // Returns the config for the Outfeed instruction. const string& outfeed_config() const; + // Delegates to HloOutfeedInstruction::set_outfeed_config. + void set_outfeed_config(const string& config); + // Returns the shape for the Outfeed instruction. const Shape& outfeed_shape() const; diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index 6da01dc088e..f5a963ef063 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -1141,6 +1141,7 @@ class HloOutfeedInstruction : public HloInstruction { const Shape& outfeed_shape() const { return outfeed_shape_; } // Returns the config for the Outfeed instruction. const string& outfeed_config() const { return outfeed_config_; } + void set_outfeed_config(const string& config) { outfeed_config_ = config; } // Returns a serialized representation of this instruction. HloInstructionProto ToProto() const override; diff --git a/tensorflow/compiler/xla/service/hlo_phi_graph.cc b/tensorflow/compiler/xla/service/hlo_phi_graph.cc index 9b69771dab2..a2cba3d1bff 100644 --- a/tensorflow/compiler/xla/service/hlo_phi_graph.cc +++ b/tensorflow/compiler/xla/service/hlo_phi_graph.cc @@ -20,10 +20,11 @@ limitations under the License. namespace xla { HloValue::Id PhiGraph::GetOptimizedId(const HloValue& value) { Node* node = value_id_to_node_[value.id()]; + CHECK(!node->mark_as_dead); return node->value_id; } -// Returns true if the input to a hlo value is the same as `inputs`. +// Returns true if the inputs to a hlo value are the same as `inputs`. bool PhiGraph::InputsEqualTo(const HloValue& value, absl::Span inputs) { auto iter = value_id_to_node_.find(value.id()); @@ -42,6 +43,7 @@ bool PhiGraph::InputsEqualTo(const HloValue& value, HloValue::Id PhiGraph::FindOptimizedValue(const HloValue::Id id) { auto iter = value_id_to_node_.find(id); CHECK(iter != value_id_to_node_.end()); + CHECK(!iter->second->mark_as_dead); return iter->second->value_id; } @@ -66,6 +68,17 @@ PhiGraph::Node* PhiGraph::CreateOrReuseNode(const HloValue& value) { void PhiGraph::ReplaceNodeWith(PhiGraph::Node* node, PhiGraph::Node* replace) { // Update users. CHECK(node->is_phi); + if (node->mark_as_dead) { + // The node has already been replaced with another. + return; + } + if (replace->mark_as_dead) { + // The node we are replacing with has already been replaced with another node.
+ auto iter = value_id_to_node_.find(replace->value_id); + CHECK(iter != value_id_to_node_.end()); + return ReplaceNodeWith(node, iter->second); + } + CHECK(!replace->mark_as_dead); for (Node* user : node->users) { absl::c_replace(user->operands, node, replace); } @@ -74,6 +87,7 @@ void PhiGraph::ReplaceNodeWith(PhiGraph::Node* node, PhiGraph::Node* replace) { for (Node* operand : node->operands) { absl::c_replace(operand->users, node, replace); } + for (HloValue::Id value_id : node_to_value_id_[node]) { CHECK(value_id_to_node_.contains(value_id)); value_id_to_node_[value_id] = replace; @@ -115,6 +129,8 @@ std::string PhiGraph::ToString() { } void PhiGraph::Optimize() { + VLOG(2) << "Optimizing phi graph:"; + XLA_VLOG_LINES(2, ToString()); // Set up users for each node. for (auto& node : node_storage_) { for (Node* input : node->operands) { @@ -141,6 +157,8 @@ void PhiGraph::Optimize() { Node* node_ptr = node.get(); + VLOG(2) << "Optimizing: " << node_ptr->value_id; + CHECK_GE(node_ptr->operands.size(), 1); // Remove self-referencing ids from users and operands. @@ -167,6 +185,9 @@ void PhiGraph::Optimize() { [&](Node* elem) { return elem == node_ptr->operands[0]; }); if (all_inputs_are_same) { + VLOG(1) << "All inputs to node " << node_ptr->value_id + << " are the same, replacing it with " + << node_ptr->operands[0]->value_id; ReplaceNodeWith(node_ptr, node_ptr->operands[0]); changed = true; continue; @@ -223,6 +244,8 @@ void PhiGraph::Optimize() { CHECK_EQ(node, non_phi); continue; } + VLOG(1) << "Replace node " << node->value_id + << " in the closure with node " << non_phi->value_id; ReplaceNodeWith(node, non_phi); changed = true; } diff --git a/tensorflow/compiler/xla/service/hlo_phi_graph.h b/tensorflow/compiler/xla/service/hlo_phi_graph.h index a0eb994438e..ca0d5c5009c 100644 --- a/tensorflow/compiler/xla/service/hlo_phi_graph.h +++ b/tensorflow/compiler/xla/service/hlo_phi_graph.h @@ -90,7 +90,7 @@ class PhiGraph { // to that phi. absl::flat_hash_map> node_to_value_id_; - // A mapping between a HloValue and node in the phi graph. + // A mapping from a HloValue to node in the phi graph. absl::flat_hash_map value_id_to_node_; std::vector> node_storage_; }; diff --git a/tensorflow/compiler/xla/service/hlo_phi_graph_test.cc b/tensorflow/compiler/xla/service/hlo_phi_graph_test.cc index 41f0454fe55..ee7300b160b 100644 --- a/tensorflow/compiler/xla/service/hlo_phi_graph_test.cc +++ b/tensorflow/compiler/xla/service/hlo_phi_graph_test.cc @@ -82,5 +82,30 @@ TEST_F(PhiGraphTest, CircularPhi) { EXPECT_EQ(D.id(), phi_graph.FindOptimizedValue(C.id())); } +TEST_F(PhiGraphTest, NestedPhiReduction) { + // def A = phi(B, C) + // def B = phi(C, E) + // def C = phi(A, B) + // def D = non-phi + // def E = Phi(D, D) + // 1. Replace E with D + // 2. 
Replace A B and C with E/D + PhiGraph phi_graph; + HloValue A = NewHloValue(true); + HloValue B = NewHloValue(true); + HloValue C = NewHloValue(true); + HloValue D = NewHloValue(false); + HloValue E = NewHloValue(true); + phi_graph.RegisterPhi(A, {&B, &C}); + phi_graph.RegisterPhi(B, {&E, &C}); + phi_graph.RegisterPhi(C, {&A, &B}); + phi_graph.RegisterPhi(E, {&D, &D}); + phi_graph.Optimize(); + EXPECT_EQ(D.id(), phi_graph.FindOptimizedValue(A.id())); + EXPECT_EQ(D.id(), phi_graph.FindOptimizedValue(B.id())); + EXPECT_EQ(D.id(), phi_graph.FindOptimizedValue(C.id())); + EXPECT_EQ(D.id(), phi_graph.FindOptimizedValue(E.id())); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 4661b8fd9e3..d8baebd6fdd 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -1123,7 +1123,8 @@ Status ShapeVerifier::HandleGetDimensionSize(HloInstruction* get_size) { Status ShapeVerifier::HandleSetDimensionSize(HloInstruction* set_size) { return CheckShape(set_size, ShapeInference::InferSetDimensionSizeShape( - set_size->operand(0)->shape(), set_size->dimension())); + set_size->operand(0)->shape(), + set_size->operand(1)->shape(), set_size->dimension())); } Status ShapeVerifier::CheckShape(const HloInstruction* instruction, diff --git a/tensorflow/compiler/xla/service/interpreter/executor.cc b/tensorflow/compiler/xla/service/interpreter/executor.cc index a2e46ba2afe..616fd031c47 100644 --- a/tensorflow/compiler/xla/service/interpreter/executor.cc +++ b/tensorflow/compiler/xla/service/interpreter/executor.cc @@ -35,7 +35,6 @@ XlaInterpreterExecutor::~XlaInterpreterExecutor() {} DeviceMemoryBase XlaInterpreterExecutor::Allocate(uint64 size, int64 memory_space) { - CHECK_EQ(memory_space, 0); return DeviceMemoryBase(new char[size], size); } diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index 307fd82069e..a35ba140e86 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -951,12 +951,7 @@ Status LayoutAssignment::CheckLayouts(HloModule* module) { if (!Shape::Equal() .IgnoreDynamicDimension() .MinorToMajorOnlyInLayout()(instruction_subshape, - buffer->shape()) && - // TODO(mingyao): Use explicit linear layout tiling to - // detect and allow special bitcast. - instruction->opcode() != HloOpcode::kBitcast && - instruction->opcode() != HloOpcode::kGetTupleElement && - instruction->opcode() != HloOpcode::kTuple) { + buffer->shape())) { return InternalError( "Layout of instruction %s at index {%s} does not match " "source LogicalBuffer %s: %s vs %s", @@ -1803,6 +1798,13 @@ Status LayoutAssignment::ClearComputationLayouts(HloComputation* computation) { // potential bugs in the layout assignment pass that may accidentally use the // existing layout. for (HloInstruction* instruction : computation->instructions()) { + if (instruction->opcode() == HloOpcode::kBitcast) { + // bitcasts are inherently layout sensitive and so a bitcast instruction + // present in the IR before layout assignment is a bug. + return InternalError( + "Unexpected bitcast operation seen during layout assignment: %s.", + instruction->ToString()); + } // Some instructions carry mandatory layouts in their shape. 
if (instruction->opcode() != HloOpcode::kInfeed && !IsLayoutConstrainedCustomCall(instruction) && diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc index 6e575247e6b..304a80c7a52 100644 --- a/tensorflow/compiler/xla/service/layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc @@ -814,6 +814,27 @@ TEST_F(LayoutAssignmentTest, ConditionalAsymmetricLayout) { EXPECT_THAT(false_result->opcode(), HloOpcode::kCopy); } +TEST_F(LayoutAssignmentTest, InternalErrorOnBitcast) { + auto builder = HloComputation::Builder(TestName()); + auto constant0 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR2WithLayout( + {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1})))); + builder.AddInstruction( + HloInstruction::CreateBitcast(constant0->shape(), constant0)); + auto m = CreateNewVerifiedModule(); + m->AddEntryComputation(builder.Build()); + + ComputationLayout computation_layout( + m->entry_computation()->ComputeProgramShape()); + LayoutAssignment layout_assignment(&computation_layout); + Status error_status = layout_assignment.Run(m.get()).status(); + EXPECT_FALSE(error_status.ok()); + EXPECT_THAT( + error_status.error_message(), + ::testing::HasSubstr( + "Unexpected bitcast operation seen during layout assignment")); +} + TEST_F(LayoutAssignmentTest, ChannelLayoutMismatch) { // Pin non matching layouts to parameter and root. const char* module_str = R"( diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc index e4ca08f972b..b01ae2efe43 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc @@ -91,10 +91,8 @@ llvm::CallInst* EmitCallToIntrinsic( } llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value, - llvm::IRBuilder<>* b) { - // TODO(tpopp): Pass this information down from the HLO's ModuleConfig. - if (b->getFastMathFlags().noNaNs() || - GetDebugOptionsFromFlags().xla_cpu_enable_fast_min_max()) { + llvm::IRBuilder<>* b, bool enable_fast_min_max) { + if (b->getFastMathFlags().noNaNs() || enable_fast_min_max) { auto cmp = b->CreateFCmpUGE(lhs_value, rhs_value); return b->CreateSelect(cmp, lhs_value, rhs_value); } else { @@ -106,10 +104,8 @@ llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value, } llvm::Value* EmitFloatMin(llvm::Value* lhs_value, llvm::Value* rhs_value, - llvm::IRBuilder<>* b) { - // TODO(tpopp): Pass this information down from the HLO's ModuleConfig. - if (b->getFastMathFlags().noNaNs() || - GetDebugOptionsFromFlags().xla_cpu_enable_fast_min_max()) { + llvm::IRBuilder<>* b, bool enable_fast_min_max) { + if (b->getFastMathFlags().noNaNs() || enable_fast_min_max) { auto cmp = b->CreateFCmpULE(lhs_value, rhs_value); return b->CreateSelect(cmp, lhs_value, rhs_value); } else { diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h index 691898011ed..642965b6470 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h @@ -108,12 +108,12 @@ llvm::CallInst* EmitCallToIntrinsic( // Emit float max. Emit maxnum intrinsic is fast math is disabled, or // fcmp+select otherwise llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value, - llvm::IRBuilder<>* b); + llvm::IRBuilder<>* b, bool enable_fast_min_max); // Emit float min. 
Emit minnum intrinsic is fast math is disabled, or // fcmp+select otherwise llvm::Value* EmitFloatMin(llvm::Value* lhs_value, llvm::Value* rhs_value, - llvm::IRBuilder<>* b); + llvm::IRBuilder<>* b, bool enable_fast_min_max); // Convenience methods for emitting a GEP instruction that indexes into a buffer // (1-dimensional array), equivalent to array[index]. The type is automatically diff --git a/tensorflow/compiler/xla/service/llvm_ir/math_ops.cc b/tensorflow/compiler/xla/service/llvm_ir/math_ops.cc index 333a2e8f612..0604cb848d2 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/math_ops.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/math_ops.cc @@ -31,9 +31,13 @@ llvm::Value* EmitFastTanh(llvm::IRBuilder<>* b, llvm::Value* input) { b->CreateFCmpOLT(abs_x, llvm::ConstantFP::get(type, kCanUseApprox)); // Clamp the input to [-9, 9]. + // + // To simplify the code base until it's an issue, don't have a slow min/max in + // this approximation. llvm::Value* input_clamped = llvm_ir::EmitFloatMin( - llvm_ir::EmitFloatMax(input, llvm::ConstantFP::get(type, -9.0), b), - llvm::ConstantFP::get(type, 9.0), b); + llvm_ir::EmitFloatMax(input, llvm::ConstantFP::get(type, -9.0), b, + /*enable_fast_min_max=*/true), + llvm::ConstantFP::get(type, 9.0), b, /*enable_fast_min_max=*/true); static constexpr std::array numerator_coeffs{ -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f, diff --git a/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc b/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc index c4bf48bcc00..c7505f5fa4a 100644 --- a/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc +++ b/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc @@ -14,7 +14,9 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h" + #include "absl/types/variant.h" + namespace xla { tensorflow::se::DeviceMemoryBase MaybeOwningDeviceMemory::AsDeviceMemoryBase() @@ -38,4 +40,10 @@ MaybeOwningDeviceMemory::Release() { return std::move(absl::get(mem_)); } +const tensorflow::se::OwningDeviceMemory* +MaybeOwningDeviceMemory::AsOwningDeviceMemory() const { + return HasOwnership() ? &absl::get(mem_) + : nullptr; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/maybe_owning_device_memory.h b/tensorflow/compiler/xla/service/maybe_owning_device_memory.h index 7d23d178130..0b56fed0a72 100644 --- a/tensorflow/compiler/xla/service/maybe_owning_device_memory.h +++ b/tensorflow/compiler/xla/service/maybe_owning_device_memory.h @@ -57,6 +57,10 @@ class MaybeOwningDeviceMemory { // A nullopt is returned if the HasOwnership() == false; absl::optional Release(); + // If the device memory is owned, returns a pointer to the internal + // OwningDeviceMemory, otherwise nullptr is returned. + const tensorflow::se::OwningDeviceMemory* AsOwningDeviceMemory() const; + // Returns true if the device_memory has ownership over underlying memory. bool HasOwnership() const; diff --git a/tensorflow/compiler/xla/service/memory_space_propagation.cc b/tensorflow/compiler/xla/service/memory_space_propagation.cc index 80eb4017477..2eb15b14eaf 100644 --- a/tensorflow/compiler/xla/service/memory_space_propagation.cc +++ b/tensorflow/compiler/xla/service/memory_space_propagation.cc @@ -29,36 +29,78 @@ StatusOr MemorySpacePropagation::Run(HloModule* module) { // Propagate the operand subshapes. 
for (int operand_idx = 0; operand_idx < instruction->operand_count(); ++operand_idx) { - modified |= - PropagateSubshapes(instruction->operand(operand_idx)->shape(), - instruction->fused_parameter(operand_idx)); + for (const ShapeUtil::IndexedShape& indexed_shape : + ShapeUtil::GetLeafShapes( + instruction->operand(operand_idx)->shape())) { + int64 memory_space = indexed_shape.shape.layout().memory_space(); + modified |= Propagate(indexed_shape.index, + instruction->fused_parameter(operand_idx), + memory_space); + } } // Propagate output subshapes. - modified |= PropagateSubshapes(instruction->shape(), - instruction->fused_expression_root()); + for (const ShapeUtil::IndexedShape& indexed_shape : + ShapeUtil::GetLeafShapes(instruction->shape())) { + int64 memory_space = indexed_shape.shape.layout().memory_space(); + modified |= + Propagate(indexed_shape.index, + instruction->fused_expression_root(), memory_space); + } } } } return modified; } -bool MemorySpacePropagation::PropagateSubshapes( - const Shape& caller_shape, const HloInstruction* callee_instruction) const { +bool MemorySpacePropagation::Propagate(ShapeIndexView index, + const HloInstruction* callee_instruction, + int64 memory_space) const { bool modified = false; - for (const ShapeUtil::IndexedShape& indexed_shape : - ShapeUtil::GetLeafShapes(caller_shape)) { - int64 memory_space = indexed_shape.shape.layout().memory_space(); - const HloValue& value = dataflow_analysis_->GetUniqueValueAt( - callee_instruction, indexed_shape.index); + const HloValue& value = dataflow_analysis_->GetUniqueValueAt( + callee_instruction, index.ToShapeIndex()); - for (const HloPosition& position : value.positions()) { - Shape* shape = ShapeUtil::GetMutableSubshape( - position.instruction->mutable_shape(), position.index); - if (shape->layout().memory_space() != memory_space) { - shape->mutable_layout()->set_memory_space(memory_space); - modified = true; - } + for (const HloPosition& position : value.positions()) { + HloInstruction* instruction = position.instruction; + Shape* shape = ShapeUtil::GetMutableSubshape(instruction->mutable_shape(), + position.index); + if (shape->layout().memory_space() == memory_space) { + continue; + } + shape->mutable_layout()->set_memory_space(memory_space); + modified = true; + + // For fusion outputs, propagate the memory space to the fusion root. + if (instruction->opcode() == HloOpcode::kFusion) { + Propagate(position.index, instruction->fused_expression_root(), + memory_space); + } + + const HloInstruction* parent_fusion = + instruction->parent()->FusionInstruction(); + // For nested fusion roots, pop one level up and propagate the memory space + // to the output of the calling fusion instruction. + if (instruction == instruction->parent()->root_instruction() && + parent_fusion->parent()->IsFusionComputation()) { + Propagate(position.index, parent_fusion, memory_space); + } + + // For nested fusion parameters, pop one level up and propagate the memory + // space to the operand of the calling fusion instruction. + if (instruction->opcode() == HloOpcode::kParameter && + parent_fusion->parent()->IsFusionComputation()) { + const HloInstruction* fusion_operand = + parent_fusion->operand(instruction->parameter_number()); + Propagate(position.index, fusion_operand, memory_space); + } + } + + for (const HloUse& use : value.uses()) { + // For fusion uses, propagate the memory space to the fusion parameter. 
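+ // (that is, the parameter with the same operand number inside the fused
+ // computation, at the same shape index).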
+ if (use.instruction->opcode() == HloOpcode::kFusion) { + modified |= Propagate( + use.operand_index, + use.instruction->fused_parameter(use.operand_number), memory_space); } } return modified; diff --git a/tensorflow/compiler/xla/service/memory_space_propagation.h b/tensorflow/compiler/xla/service/memory_space_propagation.h index 65a1dfd14a6..510e9e69f79 100644 --- a/tensorflow/compiler/xla/service/memory_space_propagation.h +++ b/tensorflow/compiler/xla/service/memory_space_propagation.h @@ -31,12 +31,11 @@ class MemorySpacePropagation : public HloModulePass { StatusOr Run(HloModule* module) override; private: - // Given the caller shape (operand or output) and its corresponding - // insturction in the fused computation (parameter or root), propagates the - // memory space to all the subshapes in the callee side. Returns true if the - // module is modified. - bool PropagateSubshapes(const Shape& caller_shape, - const HloInstruction* callee_instruction) const; + // Given the shape index (operand or output) and its corresponding instruction + // in the fused computation (parameter or root), propagates the memory space + // in the callee side. Returns true if the module is modified. + bool Propagate(ShapeIndexView index, const HloInstruction* callee_instruction, + int64 memory_space) const; std::unique_ptr dataflow_analysis_; }; diff --git a/tensorflow/compiler/xla/service/memory_space_propagation_test.cc b/tensorflow/compiler/xla/service/memory_space_propagation_test.cc index 8d74958f6aa..de45af5a190 100644 --- a/tensorflow/compiler/xla/service/memory_space_propagation_test.cc +++ b/tensorflow/compiler/xla/service/memory_space_propagation_test.cc @@ -199,5 +199,153 @@ TEST_F(MemorySpacePropagationTest, TupleOutput) { EXPECT_EQ(module->Hash(), ref->Hash()); } +TEST_F(MemorySpacePropagationTest, NestedInputFusion) { + // Tests propagating the memory space to nested fusions on the input side. 
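+ // Here the outer fusion's operand 0 (%arg0) is placed in memory space S(1),
+ // so the pass must push S(1) into %param_0.1 and, through the nested call to
+ // %bitcast_fusion, into %bf_param and %bitcast as well.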
+ absl::string_view hlo_string = R"( + HloModule NestedFusion + + %bitcast_fusion { + %bf_param = s32[3,2]{0,1:T(128)} parameter(0) + ROOT %bitcast = s32[6]{0:T(128)} bitcast(%bf_param) + } + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[3,2]{0,1:T(128)} parameter(0) + %fusion.1 = s32[6]{0:T(128)} fusion(%param_0.1), kind=kLoop, calls=bitcast_fusion + ROOT %add.0 = s32[6]{0:T(128)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %fusion.1) + } + + ENTRY %entry { + %param0 = s32[3,2]{0,1:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[3,2]{0,1:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = s32[6]{0:T(128)S(1)} fusion(s32[3,2]{0,1:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + ROOT %root = s32[6]{0:T(128)} copy(%fusion) + } + )"; + absl::string_view expected_hlo_string = R"( + HloModule NestedFusion + + %bitcast_fusion { + %bf_param = s32[3,2]{0,1:T(128)S(1)} parameter(0) + ROOT %bitcast = s32[6]{0:T(128)S(1)} bitcast(%bf_param) + } + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)S(1)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[3,2]{0,1:T(128)S(1)} parameter(0) + %fusion.1 = s32[6]{0:T(128)S(1)} fusion(%param_0.1), kind=kLoop, calls=bitcast_fusion + ROOT %add.0 = s32[6]{0:T(128)S(1)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)S(1)} %fusion.1) + } + + ENTRY %entry { + %param0 = s32[3,2]{0,1:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[3,2]{0,1:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = s32[6]{0:T(128)S(1)} fusion(s32[3,2]{0,1:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + ROOT %root = s32[6]{0:T(128)} copy(%fusion) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(hlo_string)); + MemorySpacePropagation memory_space_propagation; + EXPECT_TRUE(memory_space_propagation.Run(module.get()).ValueOrDie()); + TF_EXPECT_OK(Verify(module.get())); + TF_ASSERT_OK_AND_ASSIGN(auto ref, + ParseAndReturnVerifiedModule(expected_hlo_string)); + EXPECT_EQ(module->Hash(), ref->Hash()); +} + +TEST_F(MemorySpacePropagationTest, NestedOutputFusion) { + // Tests propagating the memory space to nested fusions on the output side. 
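+ // Here the outer fusion's result is placed in memory space S(1); propagation
+ // has to descend through the nested root %fusion.1 (calls=bitcast_fusion) so
+ // that %bitcast and %bf_param inside it pick up S(1) as well.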
+ absl::string_view hlo_string = R"( + HloModule NestedFusion + + %bitcast_fusion { + %bf_param = s32[6]{0:T(128)} parameter(0) + ROOT %bitcast = s32[3,2]{0,1:T(128)} bitcast(%bf_param) + } + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[6]{0:T(128)} parameter(0) + %add.0 = s32[6]{0:T(128)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + ROOT %fusion.1 = s32[3,2]{0,1:T(128)} fusion(%add.0), kind=kLoop, calls=bitcast_fusion + } + + ENTRY %entry { + %param0 = s32[6]{0:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[6]{0:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = s32[3,2]{0,1:T(128)S(1)} fusion(s32[6]{0:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + ROOT %root = s32[3,2]{0,1:T(128)} copy(%fusion) + } + )"; + absl::string_view expected_hlo_string = R"( + HloModule NestedFusion + + %bitcast_fusion { + %bf_param = s32[6]{0:T(128)S(1)} parameter(0) + ROOT %bitcast = s32[3,2]{0,1:T(128)S(1)} bitcast(%bf_param) + } + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)S(1)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[6]{0:T(128)S(1)} parameter(0) + %add.0 = s32[6]{0:T(128)S(1)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)S(1)} %param_0.1) + ROOT %fusion.1 = s32[3,2]{0,1:T(128)S(1)} fusion(%add.0), kind=kLoop, calls=bitcast_fusion + } + + ENTRY %entry { + %param0 = s32[6]{0:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[6]{0:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = s32[3,2]{0,1:T(128)S(1)} fusion(s32[6]{0:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + ROOT %root = s32[3,2]{0,1:T(128)} copy(%fusion) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(hlo_string)); + MemorySpacePropagation memory_space_propagation; + EXPECT_TRUE(memory_space_propagation.Run(module.get()).ValueOrDie()); + TF_EXPECT_OK(Verify(module.get())); + TF_ASSERT_OK_AND_ASSIGN(auto ref, + ParseAndReturnVerifiedModule(expected_hlo_string)); + EXPECT_EQ(module->Hash(), ref->Hash()); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/mlir_gpu/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/BUILD index ce45d937424..efe69450846 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/BUILD @@ -167,6 +167,7 @@ cc_library( "//tensorflow/compiler/mlir/xla:lhlo_legalize_to_affine", 
"//tensorflow/compiler/mlir/xla:lhlo_legalize_to_gpu", "//tensorflow/compiler/mlir/xla:xla_dialect_registration", + "//tensorflow/compiler/mlir/xla:xla_legalize_tanh_to_approximation", "//tensorflow/compiler/mlir/xla:xla_legalize_to_linalg", "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:statusor", diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc index b0cbddcdb92..196ea218ef3 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc @@ -301,7 +301,7 @@ struct RewriteKernelSignature signalPassFailure(); return; } - if (func.getBlocks().size() != 1) { + if (!llvm::hasSingleElement(func)) { func.emitError() << "surrounding function has more than one block"; signalPassFailure(); return; @@ -505,6 +505,16 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, LowerLHLOToGPUOptions options) { // Some basic cleanup. pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass()); pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass()); + // Make loops with min bounds into a conditional plus static bounds. + // Only do this if we unrolled in the first place. + if (!options.unroll_factors.empty()) { + pm.addNestedPass<::mlir::FuncOp>(mlir::createForLoopSpecializationPass()); + } + // Approximate of requested. + if (options.use_approximations) { + pm.addNestedPass<::mlir::FuncOp>( + ::mlir::xla::createLegalizeTanhToApproximationPass()); + } // Move scalar operations into the launch to ensure smaller signatures. pm.addPass(absl::make_unique()); // Take launches to launches with kernels. @@ -547,7 +557,7 @@ class LowerToNVVMPass // TODO(csigg): Remove once we support replacing non-root ops. target.addLegalOp<::mlir::gpu::GPUModuleOp, ::mlir::gpu::ModuleEndOp, ::mlir::gpu::YieldOp>(); - if (failed(mlir::applyFullConversion(m, target, patterns, &converter))) { + if (failed(mlir::applyFullConversion(m, target, patterns))) { signalPassFailure(); } } diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h index 77cf75b9e47..bd633bb06cb 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h @@ -28,6 +28,7 @@ struct LowerLHLOToGPUOptions { llvm::ArrayRef unroll_factors = {}; bool collapse_parallel_loops = true; bool rewrite_signature = true; + bool use_approximations = false; }; Status LowerLHLOToGPU(mlir::ModuleOp module, diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.cc b/tensorflow/compiler/xla/service/multi_output_fusion.cc index b95b27d6291..a21cec538d1 100644 --- a/tensorflow/compiler/xla/service/multi_output_fusion.cc +++ b/tensorflow/compiler/xla/service/multi_output_fusion.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include "absl/container/flat_hash_set.h" #include "tensorflow/compiler/xla/debug_options_flags.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_reachability.h" @@ -126,6 +127,10 @@ StatusOr MultiOutputFusion::Run(HloModule* module) { candidates_index_.clear(); all_fusion_candidates_.clear(); reachability_.reset(); + if (changed) { + HloDCE dce; + TF_RETURN_IF_ERROR(dce.Run(module).status()); + } return changed; } diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index 75a80747c1d..bb4a38ded1e 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -2248,12 +2248,17 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, } /* static */ StatusOr ShapeInference::InferSetDimensionSizeShape( - const Shape& shape, int64 dimension) { + const Shape& shape, const Shape& val_shape, int64 dimension) { if (dimension < 0 || dimension >= shape.rank()) { return InvalidArgument("SetDimensionSize dimension out of bounds: %d.", dimension); } + if (val_shape.rank() != 0 || val_shape.element_type() != S32) { + return InvalidArgument( + "SetDimensionSize's value has to be S32 scalar, got %s", + val_shape.ToString()); + } // TODO(b/119580730): Remove this restriction when very large dimension size // is needed. if (shape.dimensions(dimension) > std::numeric_limits::max()) { diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h index 2cb5930d098..d47d96ab52d 100644 --- a/tensorflow/compiler/xla/service/shape_inference.h +++ b/tensorflow/compiler/xla/service/shape_inference.h @@ -303,10 +303,13 @@ class ShapeInference { const Shape& updates_shape, const ProgramShape& to_apply_shape, const ScatterDimensionNumbers& scatter_dim_numbers); + // Helper that validates the given input shape to GetDimensionSize. static StatusOr InferGetDimensionSizeShape(const Shape& shape, int64 dimension); - static StatusOr InferSetDimensionSizeShape(const Shape& shape, + // Helper that validates the given input shape to SetDimensionSize. + static StatusOr InferSetDimensionSizeShape(const Shape& operand_shape, + const Shape& val_shape, int64 dimension); // Helper function for creating a Window proto from user-supplied data. 
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc index b5ecf6e583e..916d3ab15c8 100644 --- a/tensorflow/compiler/xla/service/shape_inference_test.cc +++ b/tensorflow/compiler/xla/service/shape_inference_test.cc @@ -1365,6 +1365,28 @@ TEST_F(ShapeInferenceTest, DotWithTwoContractingDimsPasses) { EXPECT_TRUE(ShapeUtil::Equal(inferred_status.ValueOrDie(), output_shape)); } +TEST_F(ShapeInferenceTest, ErrorSetDimensionSize) { + Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3}); + Shape val_shape = ShapeUtil::MakeShape(S32, {1}); + auto inferred_status = ShapeInference::InferSetDimensionSizeShape( + arg_shape, val_shape, /*dimension=*/0); + + EXPECT_FALSE(inferred_status.ok()); + EXPECT_THAT(inferred_status.status().error_message(), + HasSubstr("value has to be S32 scalar")); +} + +TEST_F(ShapeInferenceTest, ErrorSetDimensionSizeWrongType) { + Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3}); + Shape val_shape = ShapeUtil::MakeShape(U32, {}); + auto inferred_status = ShapeInference::InferSetDimensionSizeShape( + arg_shape, val_shape, /*dimension=*/0); + + EXPECT_FALSE(inferred_status.ok()); + EXPECT_THAT(inferred_status.status().error_message(), + HasSubstr("value has to be S32 scalar")); +} + // BatchMatMul with different batch dimension sizes fails. TEST_F(ShapeInferenceTest, DotWithMismatchedBatchDimSizesFails) { Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3}); diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc index 4658aebd571..0fd64209152 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.cc +++ b/tensorflow/compiler/xla/service/transfer_manager.cc @@ -20,6 +20,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -33,6 +34,7 @@ limitations under the License. using absl::StrCat; namespace xla { + /* static */ tensorflow::mutex TransferManager::platform_transfer_manager_mutex_( tensorflow::LINKER_INITIALIZED); @@ -200,6 +202,67 @@ void TransferManager::TransferArrayFromDevice( std::move(done), transfer_metadata); } +Status TransferManager::ReadDynamicShapes(se::Stream* stream, + ShapedBuffer* device_buffer, + Shape* host_shape, + Shape* device_shape) { + DCHECK(device_shape->is_dynamic()); + Shape original_device_shape = *device_shape; + Shape original_host_shape = *host_shape; + TF_RETURN_IF_ERROR(stream->BlockHostUntilDone()); + + TF_ASSIGN_OR_RETURN(auto compiler, + Compiler::GetForPlatform(stream->parent()->platform())); + TF_RETURN_IF_ERROR(device_buffer->buffers().ForEachMutableElementWithStatus( + [&](const ShapeIndex& index, se::DeviceMemoryBase* buffer) { + const Shape& buffer_shape = + ShapeUtil::GetSubshape(*device_shape, index); + if (buffer_shape.IsTuple()) { + return Status::OK(); + } + Shape& host_sub_shape = + *ShapeUtil::GetMutableSubshape(host_shape, index); + Shape& device_sub_shape = + *ShapeUtil::GetMutableSubshape(device_shape, index); + if (device_sub_shape.is_static()) { + return Status::OK(); + } + + // Read the dynamic shape metadata from the device stream. 
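+ // The metadata sits immediately after the statically-shaped payload of the
+ // buffer: its offset is the byte size of the static version of the shape, and
+ // its length is whatever the full bounded shape needs beyond that, i.e. one
+ // S32 per dimension holding that dimension's runtime size.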
+ auto shape_size_fn = compiler->ShapeSizeBytesFunction(); + Shape buffer_shape_static = ShapeUtil::MakeStaticShape(buffer_shape); + const int64 offset = shape_size_fn(buffer_shape_static); + int64 metadata_size = shape_size_fn(buffer_shape) - offset; + if (metadata_size == 0) { + return InvalidArgument("Dynamic shape metadata size should not be 0"); + } + auto buffer_8 = se::DeviceMemory(*buffer); + auto metadata_buffer = + stream->parent()->GetSubBuffer(&buffer_8, offset, metadata_size); + TF_ASSIGN_OR_RETURN( + auto metadata, + TransferArrayFromDevice( + stream, + ShapeUtil::MakeShape(S32, {buffer_shape.dimensions_size()}), + metadata_buffer)); + + // Update shape size from metadata. + for (int64 i = 0; i < metadata.element_count(); ++i) { + host_sub_shape.mutable_dimensions()[i] = metadata.Get({i}); + device_sub_shape.mutable_dimensions()[i] = metadata.Get({i}); + } + return Status::OK(); + })); + host_shape->clear_dynamic_dimensions(); + device_shape->clear_dynamic_dimensions(); + + TF_RET_CHECK(ShapeUtil::DynamicShapeIsCompatible(*device_shape, + original_device_shape)); + TF_RET_CHECK( + ShapeUtil::DynamicShapeIsCompatible(*host_shape, original_host_shape)); + return Status::OK(); +} + /* static */ void TransferManager::RegisterTransferManager( se::Platform::Id platform_id, TransferManagerCreationFunction creation_function) { @@ -355,7 +418,9 @@ StatusOr TransferManager::AllocateScopedShapedBuffer( ShapeUtil::GetSubshape(shaped_buffer.on_device_shape(), index); TF_ASSIGN_OR_RETURN(auto memory, allocator->Allocate(shaped_buffer.device_ordinal(), - GetByteSizeRequirement(subshape))); + GetByteSizeRequirement(subshape), + /*retry_on_failure=*/true, + subshape.layout().memory_space())); // Move the allocated buffer into the ScopedShapedBuffer, which owns it. memory_base = memory.Release(); } diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h index e3f8ceacc42..c0670d26eee 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.h +++ b/tensorflow/compiler/xla/service/transfer_manager.h @@ -184,6 +184,15 @@ class TransferManager { const se::DeviceMemoryBase& source, const TransferMetadata* transfer_metadata = nullptr); + // Read from a device buffer and update the dynamic dimension sizes of + // `host_shape` and `device_shape`. The function takes in bounded dynamic + // shapes, and returns static shapes with dynamic shapes updated. + // The shape of the buffer also have to be compatible with the host shape and + // device shape. + virtual Status ReadDynamicShapes(se::Stream* stream, + ShapedBuffer* device_buffer, + Shape* host_shape, Shape* device_shape); + // Transfers the given literal into the Infeed interface of the device, // using the given executor. 
virtual Status TransferLiteralToInfeed(se::StreamExecutor* executor, diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index ab46e49b181..bce40578132 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -1461,7 +1461,7 @@ ShapeUtil::ReshapeLeavesDimensionsUnmodified( return shape; } -/* static */ bool ShapeUtil::DynamicShapeIsCompatible( +/* static */ bool ShapeUtil::DynamicArrayShapeIsCompatible( const xla::Shape& dynamic_shape, const xla::Shape& bounded_shape) { if (dynamic_shape.rank() != bounded_shape.rank()) { return false; @@ -1474,6 +1474,36 @@ ShapeUtil::ReshapeLeavesDimensionsUnmodified( return true; } +/* static */ bool ShapeUtil::DynamicShapeIsCompatible( + const xla::Shape& dynamic_shape, const xla::Shape& bounded_shape) { + bool compatible = true; + xla::ShapeUtil::ForEachSubshape(dynamic_shape, [&](const Shape& sub_shape, + const ShapeIndex& index) { + if (compatible) { + auto subshape_result = TryGetSubshape(bounded_shape, index); + if (subshape_result.ok()) { + const Shape* bounded_sub_shape = subshape_result.ConsumeValueOrDie(); + if (sub_shape.IsTuple()) { + if (!bounded_sub_shape->IsTuple()) { + compatible = false; + } + } else { + if (bounded_sub_shape->IsTuple()) { + compatible = false; + } else if (!sub_shape.is_static() && + !DynamicArrayShapeIsCompatible(sub_shape, + *bounded_sub_shape)) { + compatible = false; + } + } + } else { + compatible = false; + } + } + }); + return compatible; +} + /* static */ Shape ShapeUtil::FilterDimensions( const std::function& p, Shape shape) { CHECK(shape.IsArray()); diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index dde56587482..fe1a8acf6e4 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -657,7 +657,11 @@ class ShapeUtil { Shape shape); // Returns true if `dynamic_shape` has dimensions that are less-equal to the - // "bounded_shape". + // "bounded_shape". Shapes must be arrays. + static bool DynamicArrayShapeIsCompatible(const xla::Shape& dynamic_shape, + const xla::Shape& bounded_shape); + + // Same as DynamicArrayShapeIsCompatible() but supports tuples. static bool DynamicShapeIsCompatible(const xla::Shape& dynamic_shape, const xla::Shape& bounded_shape); diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index e1863a8a4cf..9b36117602b 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -52,16 +52,26 @@ cc_library( name = "test_macros_header", testonly = True, hdrs = ["test_macros.h"], - deps = [ - "//tensorflow/compiler/xla:types", - "//tensorflow/core:test", - "@com_google_absl//absl/strings", - ], ) # Generate a test_macros_${BACKEND} library per backend with the proper copts. 
generate_backend_test_macros() +cc_library( + name = "manifest_checking_test", + testonly = True, + srcs = ["manifest_checking_test.cc"], + hdrs = ["manifest_checking_test.h"], + deps = [ + ":test_macros_header", + "//tensorflow/core:regexp_internal", + "//tensorflow/core:test", + "//tensorflow/core/platform:logging", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", + ], +) + cc_library( name = "test_utils", srcs = ["test_utils.cc"], @@ -136,6 +146,7 @@ cc_library( hdrs = ["hlo_test_base.h"], deps = [ ":literal_test_util", + ":manifest_checking_test", ":test_utils", ":verified_hlo_module", "//tensorflow/compiler/xla:debug_options_flags", @@ -193,6 +204,7 @@ cc_library( srcs = ["client_library_test_base.cc"], hdrs = ["client_library_test_base.h"], deps = [ + ":manifest_checking_test", "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:array3d", "//tensorflow/compiler/xla:array4d", @@ -273,6 +285,7 @@ cc_library( hdrs = ["local_client_test_base.h"], deps = [ ":client_library_test_base", + ":manifest_checking_test", ":verified_hlo_module", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl index c0c0751b0de..94d870aa2ef 100644 --- a/tensorflow/compiler/xla/tests/build_defs.bzl +++ b/tensorflow/compiler/xla/tests/build_defs.bzl @@ -266,11 +266,6 @@ def generate_backend_test_macros(backends = []): "-DXLA_DISABLED_MANIFEST=\\\"%s\\\"" % manifest, ], deps = [ - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/strings", - "//tensorflow/compiler/xla:types", - "//tensorflow/core:lib", - "//tensorflow/core:regexp_internal", - "//tensorflow/core:test", + "//tensorflow/core/platform:logging", ], ) diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h index 790497f888e..17bb70bdb42 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.h +++ b/tensorflow/compiler/xla/tests/client_library_test_base.h @@ -35,6 +35,7 @@ limitations under the License. #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/tests/manifest_checking_test.h" #include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/bitmap.h" @@ -62,7 +63,7 @@ std::vector ExpandUseBfloat16( } // A client library test establishes an in-process XLA client connection. -class ClientLibraryTestBase : public ::testing::Test { +class ClientLibraryTestBase : public ManifestCheckingTest { protected: explicit ClientLibraryTestBase(se::Platform* platform = nullptr); diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h index 85b1876dd3c..17c2a55ba5b 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.h +++ b/tensorflow/compiler/xla/tests/hlo_test_base.h @@ -32,6 +32,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/shape_layout.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/tests/manifest_checking_test.h" #include "tensorflow/compiler/xla/tests/verified_hlo_module.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -67,7 +68,7 @@ namespace xla { // ) // // For a more detailed example, see "../tests/sample_text_test.cc". -class HloTestBase : public ::testing::Test { +class HloTestBase : public ManifestCheckingTest { public: // Creates a new HLO module for a test. The module created will have // TestName() for its name; it will also automatically populate its debug diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc index 53c0d84854e..3e9a3ec2314 100644 --- a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc +++ b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc @@ -71,6 +71,8 @@ int main(int argc, char** argv) { triple_string = "aarch64-none-linux-gnu"; } else if (target_cpu == "x64_windows") { triple_string = "x86_64-pc-windows-msvc19"; + } else if (target_cpu == "ppc") { + triple_string = "ppc64le-ibm-linux-gnu"; } else if (target_cpu == "local") { triple_string = llvm::sys::getDefaultTargetTriple(); } else { diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h index ea457024618..c1951ad1021 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.h +++ b/tensorflow/compiler/xla/tests/local_client_test_base.h @@ -32,6 +32,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/transfer_manager.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/manifest_checking_test.h" #include "tensorflow/compiler/xla/tests/verified_hlo_module.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/mutex.h" @@ -75,7 +76,7 @@ class TestAllocator : public se::StreamExecutorMemoryAllocator { }; // A base class for tests which exercise the LocalClient interface. -class LocalClientTestBase : public ::testing::Test { +class LocalClientTestBase : public ManifestCheckingTest { protected: struct EigenThreadPoolWrapper; explicit LocalClientTestBase(se::Platform* platform = nullptr); diff --git a/tensorflow/compiler/xla/tests/manifest_checking_test.cc b/tensorflow/compiler/xla/tests/manifest_checking_test.cc new file mode 100644 index 00000000000..ac6204f9df9 --- /dev/null +++ b/tensorflow/compiler/xla/tests/manifest_checking_test.cc @@ -0,0 +1,129 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/tests/manifest_checking_test.h" + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/ascii.h" +#include "absl/strings/str_split.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/regexp.h" + +namespace xla { + +namespace { + +// Mapping from test name; i.e. MyTest.MyTestCase to platforms on which it is +// disabled - a sequence of regexps. +using ManifestT = absl::flat_hash_map>; + +ManifestT ReadManifest() { + ManifestT manifest; + + absl::string_view path = absl::NullSafeStringView(*DisabledManifestPath()); + if (path.empty()) { + return manifest; + } + + // Note: parens are required to disambiguate vs function decl. + std::ifstream file_stream((std::string(path))); + std::string contents((std::istreambuf_iterator(file_stream)), + std::istreambuf_iterator()); + + std::vector lines = absl::StrSplit(contents, '\n'); + for (std::string& line : lines) { + auto comment = line.find("//"); + if (comment != std::string::npos) { + line = line.substr(0, comment); + } + if (line.empty()) { + continue; + } + absl::StripTrailingAsciiWhitespace(&line); + std::vector pieces = absl::StrSplit(line, ' '); + CHECK_GE(pieces.size(), 1); + auto& platforms = manifest[pieces[0]]; + for (size_t i = 1; i < pieces.size(); ++i) { + platforms.push_back(pieces[i]); + } + } + return manifest; +} + +} // namespace + +void ManifestCheckingTest::SetUp() { + const testing::TestInfo* test_info = + testing::UnitTest::GetInstance()->current_test_info(); + absl::string_view test_case_name = test_info->test_suite_name(); + absl::string_view test_name = test_info->name(); + VLOG(1) << "test_case_name: " << test_case_name; + VLOG(1) << "test_name: " << test_name; + + // Remove the type suffix from the test case name. + if (const char* type_param = test_info->type_param()) { + VLOG(1) << "type_param: " << type_param; + size_t last_slash = test_case_name.rfind('/'); + test_case_name = test_case_name.substr(0, last_slash); + VLOG(1) << "test_case_name: " << test_case_name; + } + + // Remove the test instantiation name if it is present. + auto first_slash = test_case_name.find('/'); + if (first_slash != test_case_name.npos) { + test_case_name.remove_prefix(first_slash + 1); + VLOG(1) << "test_case_name: " << test_case_name; + } + + ManifestT manifest = ReadManifest(); + + // If the test name ends with a slash followed by one or more characters, + // strip that off. + auto last_slash = test_name.rfind('/'); + if (last_slash != test_name.npos) { + test_name = test_name.substr(0, last_slash); + VLOG(1) << "test_name: " << test_name; + } + + // First try full match: test_case_name.test_name + // If that fails, try to find just the test_case_name; this would disable all + // tests in the test case. + auto it = manifest.find(absl::StrCat(test_case_name, ".", test_name)); + if (it == manifest.end()) { + it = manifest.find(test_case_name); + if (it == manifest.end()) { + return; + } + } + + // Expect a full match vs. one of the platform regexps to disable the test. + const std::vector& disabled_platforms = it->second; + auto platform_string = *TestPlatform(); + for (const auto& s : disabled_platforms) { + if (RE2::FullMatch(/*text=*/platform_string, /*re=*/s)) { + GTEST_SKIP(); + return; + } + } + + // We didn't hit in the disabled manifest entries, so don't disable it. 
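+ //
+ // For reference, the manifest read above is a plain text file in which each
+ // line names a test (or a whole test case) followed by platform regexps; the
+ // entries below are hypothetical:
+ //
+ //   ClientLibraryTestBase.AddTwoNumbers cpu gpu
+ //   TupleHloTest interpreter
+ //
+ // The first token is looked up as "TestCase.TestName" (falling back to the
+ // test case name alone); the remaining tokens are regexps matched in full
+ // against *TestPlatform().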
+} + +} // namespace xla diff --git a/tensorflow/compiler/xla/tests/manifest_checking_test.h b/tensorflow/compiler/xla/tests/manifest_checking_test.h new file mode 100644 index 00000000000..4f44ed76a3e --- /dev/null +++ b/tensorflow/compiler/xla/tests/manifest_checking_test.h @@ -0,0 +1,35 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_TESTS_MANIFEST_CHECKING_TEST_H_ +#define TENSORFLOW_COMPILER_XLA_TESTS_MANIFEST_CHECKING_TEST_H_ + +#include "tensorflow/core/platform/test.h" + +namespace xla { + +// This class allows us to intercept the test name and use an arbitrary +// heuristic to decide whether the test case should be disabled. We +// determine whether the test case should be disabled by resolving the (test +// case name, test name) in a manifest file. +class ManifestCheckingTest : public ::testing::Test { + protected: + // This method runs before each test runs. + void SetUp() override; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_TESTS_MANIFEST_CHECKING_TEST_H_ diff --git a/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc b/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc index 2b19aaded9c..2231fc6feab 100644 --- a/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc +++ b/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc @@ -45,7 +45,8 @@ void CompileAndExecute( xla::ClientLibrary::GetXlaService(client->platform()) ->backend() .memory_allocator()); - StatusOr result = executable->Run({}, execute_options); + StatusOr result = + executable->Run(absl::Span(), execute_options); { absl::MutexLock lock(results_mutex); results->emplace_back(device_ordinal, std::move(result)); diff --git a/tensorflow/compiler/xla/tests/test_macros.cc b/tensorflow/compiler/xla/tests/test_macros.cc index dc9ac7b684a..eecbb89b877 100644 --- a/tensorflow/compiler/xla/tests/test_macros.cc +++ b/tensorflow/compiler/xla/tests/test_macros.cc @@ -15,93 +15,18 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/test_macros.h" -#include -#include -#include - -#include "absl/container/flat_hash_map.h" -#include "absl/strings/str_cat.h" -#include "absl/strings/str_split.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/regexp.h" namespace xla { -namespace { -// Mapping from test name; i.e. MyTest.MyTestCase to platforms on which it is -// disabled - a sequence of regexps. -using ManifestT = absl::flat_hash_map>; - -ManifestT ReadManifest() { - ManifestT manifest; - - string path = XLA_DISABLED_MANIFEST; - if (path.empty()) { - return manifest; - } - - std::ifstream file_stream(path); - // Note: parens are required to disambiguate vs function decl. 
- string contents((std::istreambuf_iterator(file_stream)), - std::istreambuf_iterator()); - - std::vector lines = absl::StrSplit(contents, '\n'); - for (string& line : lines) { - auto comment = line.find("//"); - if (comment != string::npos) { - line = line.substr(0, comment); - } - if (line.empty()) { - continue; - } - absl::StripTrailingAsciiWhitespace(&line); - std::vector pieces = absl::StrSplit(line, ' '); - CHECK_GE(pieces.size(), 1); - auto& platforms = manifest[pieces[0]]; - for (int64 i = 1; i < pieces.size(); ++i) { - platforms.push_back(pieces[i]); - } - } - return manifest; +static bool InitModule() { + *DisabledManifestPath() = XLA_DISABLED_MANIFEST; + VLOG(1) << "DisabledManifestPath: " << *DisabledManifestPath(); + *TestPlatform() = XLA_PLATFORM; + VLOG(1) << "TestPlatform: " << *TestPlatform(); + return false; } -} // namespace - -std::string PrependDisabledIfIndicated(absl::string_view test_case_name, - absl::string_view test_name) { - ManifestT manifest = ReadManifest(); - - // If the test name ends with a slash followed by one or more digits, strip - // that off; this is just a shard number, and matching on this would be - // unstable even if someone wanted to do it. - static LazyRE2 shard_num_pattern = {R"(/\d+$)"}; - absl::string_view suffix; - if (RE2::PartialMatch(test_name, *shard_num_pattern, &suffix)) { - test_name.remove_suffix(suffix.size()); - } - - // First try full match: test_case_name.test_name - // If that fails, try to find just the test_case_name; this would disable all - // tests in the test case. - auto it = manifest.find(absl::StrCat(test_case_name, ".", test_name)); - if (it == manifest.end()) { - it = manifest.find(test_case_name); - if (it == manifest.end()) { - return std::string(test_name); - } - } - - // Expect a full match vs. one of the platform regexps to disable the test. - const std::vector& disabled_platforms = it->second; - string platform_string = XLA_PLATFORM; - for (const auto& s : disabled_platforms) { - if (RE2::FullMatch(/*text=*/platform_string, /*re=*/s)) { - return absl::StrCat("DISABLED_", test_name); - } - } - - // We didn't hit in the disabled manifest entries, so don't disable it. - return std::string(test_name); -} +static bool module_initialized = InitModule(); } // namespace xla diff --git a/tensorflow/compiler/xla/tests/test_macros.h b/tensorflow/compiler/xla/tests/test_macros.h index 33d2dff9721..16cc9ff6feb 100644 --- a/tensorflow/compiler/xla/tests/test_macros.h +++ b/tensorflow/compiler/xla/tests/test_macros.h @@ -28,12 +28,6 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_TESTS_TEST_MACROS_H_ #define TENSORFLOW_COMPILER_XLA_TESTS_TEST_MACROS_H_ -#include - -#include "absl/strings/string_view.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/platform/test.h" - #define DISABLED_ON_CPU(X) X #define DISABLED_ON_GPU(X) X #define DISABLED_ON_GPU_ROCM(X) X @@ -79,117 +73,22 @@ limitations under the License. namespace xla { -// Reads a disabled manifest file to resolve whether test cases should be -// disabled on a particular platform. For a test that should be disabled, -// returns DISABLED_ prepended to its name; otherwise returns the test name -// unmodified. 
-std::string PrependDisabledIfIndicated(absl::string_view test_case_name, - absl::string_view test_name); +inline const char** DisabledManifestPath() { + static const char* disabled_manifest_path = nullptr; + return &disabled_manifest_path; +} + +inline const char** TestPlatform() { + static const char* test_platform = nullptr; + return &test_platform; +} } // namespace xla -// This is the internal "gtest" class instantiation -- it is identical to the -// GTEST_TEST_ macro, except that we intercept the test name for potential -// modification by PrependDisabledIfIndicated. That file can use an arbitrary -// heuristic to decide whether the test case should be disabled, and we -// determine whether the test case should be disabled by resolving the (test -// case name, test name) in a manifest file. -#define XLA_GTEST_TEST_(test_case_name, test_name, parent_class) \ - class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ - : public parent_class { \ - public: \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \ - \ - private: \ - virtual void TestBody(); \ - static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_; \ - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_case_name, \ - test_name)); \ - }; \ - \ - ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, \ - test_name)::test_info_ = \ - ::testing::RegisterTest( \ - #test_case_name, \ - ::xla::PrependDisabledIfIndicated(#test_case_name, #test_name) \ - .c_str(), \ - nullptr, nullptr, __FILE__, __LINE__, []() -> parent_class* { \ - return new GTEST_TEST_CLASS_NAME_(test_case_name, test_name)(); \ - }); \ - void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() +#define XLA_TEST_F(test_fixture, test_name) TEST_F(test_fixture, test_name) -// This is identical to the TEST_F macro from "gtest", but it potentially -// disables the test based on an external manifest file, DISABLED_MANIFEST. -// -// Per usual, you can see what tests are available via --gunit_list_tests and -// choose to run tests that have been disabled via the manifest via -// --gunit_also_run_disabled_tests. -#define XLA_TEST_F(test_fixture, test_name) \ - XLA_GTEST_TEST_(test_fixture, test_name, test_fixture) +#define XLA_TEST_P(test_case_name, test_name) TEST_P(test_case_name, test_name) -// Likewise, this is identical to the TEST_P macro from "gtest", but -// potentially disables the test based on the DISABLED_MANIFEST file. -// -// We have to wrap this in an outer layer so that any DISABLED_ON_* macros will -// be properly expanded before the stringification occurs. 
-#define XLA_TEST_P_IMPL_(test_case_name, test_name) \ - class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ - : public test_case_name { \ - public: \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \ - virtual void TestBody(); \ - \ - private: \ - static int AddToRegistry() { \ - ::testing::UnitTest::GetInstance() \ - ->parameterized_test_registry() \ - .GetTestCasePatternHolder( \ - #test_case_name, \ - ::testing::internal::CodeLocation(__FILE__, __LINE__)) \ - ->AddTestPattern( \ - #test_case_name, \ - ::xla::PrependDisabledIfIndicated(#test_case_name, #test_name) \ - .c_str(), \ - new ::testing::internal::TestMetaFactory()); \ - return 0; \ - } \ - static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_case_name, \ - test_name)); \ - }; \ - int GTEST_TEST_CLASS_NAME_(test_case_name, \ - test_name)::gtest_registering_dummy_ = \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \ - void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() - -#define XLA_TEST_P(test_case_name, test_name) \ - XLA_TEST_P_IMPL_(test_case_name, test_name) - -// This is identical to the TEST_F macro from "gtest", but it potentially -// disables the test based on an external manifest file, DISABLED_MANIFEST. -#define XLA_TYPED_TEST(CaseName, TestName) \ - template \ - class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \ - : public CaseName { \ - private: \ - typedef CaseName TestFixture; \ - typedef gtest_TypeParam_ TypeParam; \ - virtual void TestBody(); \ - }; \ - bool gtest_##CaseName##_##TestName##_registered_ GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::internal::TypeParameterizedTest< \ - CaseName, \ - ::testing::internal::TemplateSel, \ - GTEST_TYPE_PARAMS_(CaseName)>:: \ - Register( \ - "", ::testing::internal::CodeLocation(__FILE__, __LINE__), \ - #CaseName, \ - ::xla::PrependDisabledIfIndicated(#CaseName, #TestName).c_str(), \ - 0); \ - template \ - void GTEST_TEST_CLASS_NAME_(CaseName, \ - TestName)::TestBody() +#define XLA_TYPED_TEST(CaseName, TestName) TYPED_TEST(CaseName, TestName) #endif // TENSORFLOW_COMPILER_XLA_TESTS_TEST_MACROS_H_ diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc index 9ef589e5511..b6ad44497e6 100644 --- a/tensorflow/compiler/xla/tests/tuple_test.cc +++ b/tensorflow/compiler/xla/tests/tuple_test.cc @@ -577,5 +577,37 @@ XLA_TEST_F(TupleHloTest, EXPECT_TRUE(LiteralTestUtil::Equal(expected, literal)); } +XLA_TEST_F(TupleHloTest, TupleSelectOfSort) { + const char* testcase = R"( + HloModule sort + + compare { + p.1.lhs = s32[] parameter(2) + p.1.rhs = s32[] parameter(3) + p.0.lhs = f32[] parameter(0) + p.0.rhs = f32[] parameter(1) + ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT + } + + ENTRY Sort { + keys = f32[2]{0} iota(), iota_dimension=0 + values = s32[2]{0} iota(), iota_dimension=0 + preds = pred[] constant(true) + alt = (f32[2], s32[2]) parameter(0) + + sorted = (f32[2]{0}, s32[2]{0}) sort(keys, values), dimensions={0}, + to_apply=compare + ROOT selected = (f32[2], s32[2]) tuple-select(preds, sorted, alt) + } + )"; + auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie(); + auto param = LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR1({2, 3}), + LiteralUtil::CreateR1({3, 4})); + auto expected = LiteralUtil::MakeTupleOwned( + LiteralUtil::CreateR1({0, 1}), LiteralUtil::CreateR1({0, 1})); + auto result = ExecuteAndTransfer(std::move(module), {¶m}); + EXPECT_TRUE(LiteralTestUtil::Equal(expected, 
result)); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc index d575bbb1f3e..8e8c3605cc7 100644 --- a/tensorflow/compiler/xla/tests/while_test.cc +++ b/tensorflow/compiler/xla/tests/while_test.cc @@ -1324,14 +1324,16 @@ void BM_WhileLoop(int num_iters) { options.set_allocator(&allocator); const int kWarmups = 2; for (int i = 0; i < kWarmups; ++i) { - auto result = executable->Run({}, options); + auto result = + executable->Run(absl::Span(), options); ASSERT_TRUE(result.ok()); } // Run benchmark. tensorflow::testing::StartTiming(); for (int i = 0; i < num_iters; ++i) { - auto result = executable->Run({}, options); + auto result = + executable->Run(absl::Span(), options); ASSERT_TRUE(result.ok()); } } diff --git a/tensorflow/compiler/xrt/BUILD b/tensorflow/compiler/xrt/BUILD index 332c8ff9a14..6a704be4adb 100644 --- a/tensorflow/compiler/xrt/BUILD +++ b/tensorflow/compiler/xrt/BUILD @@ -74,6 +74,7 @@ cc_library( "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/service:backend", "//tensorflow/compiler/xla/service:executable", + "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc index 2fc599e42df..bfd48bd1442 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc @@ -51,12 +51,6 @@ namespace tensorflow { namespace { -struct InputBuffers { - std::vector> input_tuples; - std::vector input_allocations; - std::vector input_pointers; -}; - uint32 InitialRandomSeed() { // Support plumbing the TF seed through to XLA is being worked on. // If a user wants deterministic behavior, their best option @@ -80,75 +74,51 @@ uint32 GetXLARandomSeed() { return counter.fetch_add(2); } -xla::StatusOr GetInputBuffers( - XRTMemoryManager::WorkingSet* working_set, xla::Backend* backend, - const std::vector& input_coords, bool release_inputs) { - InputBuffers input_buffers; - input_buffers.input_tuples.reserve(input_coords.size()); - input_buffers.input_allocations.reserve(input_coords.size()); - input_buffers.input_pointers.reserve(input_coords.size()); - for (size_t i = 0; i < input_coords.size(); ++i) { - TF_RETURN_IF_ERROR( - working_set->LookupAndPin(backend, input_coords[i].handle)); - auto tuple = working_set->PinnedTuples().back(); - input_buffers.input_tuples.emplace_back(tuple); - if (release_inputs) { - // We are holding a reference to the tuple, so we can safely delete it - // from the resource manager here. 
- TF_RETURN_IF_ERROR( - working_set->MemoryManager()->Release(input_coords[i].handle)); - VLOG(2) << "Released allocation handle " << input_coords[i].handle; - } - if (input_coords[i].index.empty()) { - TF_ASSIGN_OR_RETURN(xla::ShapedBuffer shaped_buffer, - tuple->ToShapedBuffer()); - input_buffers.input_allocations.emplace_back(std::move(shaped_buffer)); - } else { - TF_ASSIGN_OR_RETURN(xla::ShapedBuffer shaped_buffer, - tuple->ToShapedBuffer()); - TF_ASSIGN_OR_RETURN(xla::ShapedBuffer sub_shaped_buffer, - shaped_buffer.SubShapedBuffer(input_coords[i].index)); - input_buffers.input_allocations.emplace_back( - std::move(sub_shaped_buffer)); - } +std::vector GetDynamicInputInfo( + const xla::ComputationLayout& computation_layout) { + std::vector input_is_dynamic; + input_is_dynamic.reserve(computation_layout.parameter_count()); + for (int64 i = 0; i < computation_layout.parameter_count(); ++i) { + input_is_dynamic.push_back( + !computation_layout.parameter_shape(i).is_static()); } - for (size_t i = 0; i < input_buffers.input_allocations.size(); ++i) { - input_buffers.input_pointers.push_back(&input_buffers.input_allocations[i]); - } - return std::move(input_buffers); + return input_is_dynamic; } -xla::StatusOr GetChainedOpInputs( +xla::StatusOr>> GetInputTuples( + xla::LocalExecutable* executable, XRTMemoryManager::WorkingSet* working_set, + xla::Backend* backend, const std::vector& input_coords, + bool release_inputs) { + const xla::ComputationLayout& computation_layout = + executable->executable()->module_config().entry_computation_layout(); + + return GetInputTupleAllocations( + input_coords, working_set, backend, computation_layout.parameter_count(), + [&](int64 i) { return computation_layout.parameter_shape(i); }, + release_inputs); +} + +xla::StatusOr>> GetChainedOpInputTuples( const xrt::XRTChainedExecuteOp& op, absl::Span> op_inputs) { - InputBuffers input_buffers; - input_buffers.input_tuples.reserve(op.inputs_size()); - input_buffers.input_allocations.reserve(op.inputs_size()); - input_buffers.input_pointers.reserve(op.inputs_size()); + std::vector> input_tuples; + input_tuples.reserve(op.inputs_size()); for (int i = 0; i < op.inputs_size(); ++i) { auto& input = op.inputs(i); - input_buffers.input_tuples.emplace_back(op_inputs[i]); // Thanks to the greatness of proto3, there is no way to query for // explicitly set fields, so the default for output_index (zero) means no // sub-index. As consequence, the real index is output_index - 1. 
if (input.output_index() == 0) { - TF_ASSIGN_OR_RETURN(xla::ShapedBuffer shaped_buffer, - input_buffers.input_tuples.back()->ToShapedBuffer()); - input_buffers.input_allocations.emplace_back(std::move(shaped_buffer)); + input_tuples.emplace_back(op_inputs[i]); } else { - TF_ASSIGN_OR_RETURN(xla::ShapedBuffer shaped_buffer, - input_buffers.input_tuples.back()->ToShapedBuffer()); - TF_ASSIGN_OR_RETURN( - xla::ShapedBuffer sub_shaped_buffer, - shaped_buffer.SubShapedBuffer({input.output_index() - 1})); - input_buffers.input_allocations.emplace_back( - std::move(sub_shaped_buffer)); + XRTTupleAllocation* sub_tuple; + TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer( + op_inputs[i].get(), {input.output_index() - 1}, &sub_tuple, + /*alias_parent_allocation=*/true)); + input_tuples.emplace_back(sub_tuple); } } - for (size_t i = 0; i < input_buffers.input_allocations.size(); ++i) { - input_buffers.input_pointers.push_back(&input_buffers.input_allocations[i]); - } - return std::move(input_buffers); + return input_tuples; } // Given a shape, returns a byte array representing the shape metadata of the @@ -228,12 +198,11 @@ Status UpdateMetadata(se::Stream* stream, se::DeviceMemory* buffer, // As we can't expand the size of an existing memory allocation, a reallocation // is required. A list of new allocations are returned after this function. The // caller is reponsible for maintaining those allocations. -xla::StatusOr> UpdateDynamicInputs( +Status UpdateDynamicInputs( se::Stream* stream, se::DeviceMemoryAllocator* allocator, - std::vector runtime_inputs, + std::vector* execution_inputs, const std::vector& compile_time_shapes) { - std::vector new_allocations; - TF_RET_CHECK(runtime_inputs.size() == compile_time_shapes.size()); + TF_RET_CHECK(execution_inputs->size() == compile_time_shapes.size()); TF_ASSIGN_OR_RETURN(auto compiler, xla::Compiler::GetForPlatform( stream->parent()->platform())); auto shape_size_fn = compiler->ShapeSizeBytesFunction(); @@ -242,146 +211,103 @@ xla::StatusOr> UpdateDynamicInputs( if (compile_time_shape.is_static()) { continue; } - auto* runtime_input = runtime_inputs[i]; - + xla::ExecutionInput* execution_input = &(*execution_inputs)[i]; bool element_modified = false; TF_RETURN_IF_ERROR(xla::ShapeUtil::ForEachSubshapeWithStatus( compile_time_shape, - [&](const xla::Shape& compile_time_shape, + [&](const xla::Shape& sub_shape, const xla::ShapeIndex& index) -> Status { - if (compile_time_shape.IsTuple() || compile_time_shape.is_static()) { + if (sub_shape.IsTuple() || sub_shape.is_static()) { return Status::OK(); } - const xla::Shape& runtime_shape = xla::ShapeUtil::GetSubshape( - runtime_input->on_device_shape(), index); - TF_RET_CHECK(!runtime_shape.IsTuple()); - TF_RET_CHECK(xla::ShapeUtil::DynamicShapeIsCompatible( - runtime_shape, compile_time_shape)); - se::DeviceMemoryBase* static_input = - runtime_input->buffers().mutable_element(index); TF_ASSIGN_OR_RETURN( - auto dynamic_input, + const xla::Shape* runtime_shape, + xla::ShapeUtil::TryGetSubshape(execution_input->shape(), index)); + TF_RET_CHECK(!runtime_shape->IsTuple()); + TF_RET_CHECK(xla::ShapeUtil::DynamicArrayShapeIsCompatible( + *runtime_shape, sub_shape)); + TF_ASSIGN_OR_RETURN( + se::OwningDeviceMemory dynamic_input, allocator->Allocate(stream->parent()->device_ordinal(), - shape_size_fn(compile_time_shape))); - new_allocations.emplace_back(std::move(dynamic_input)); - se::DeviceMemory* dynamic_input_base = - new_allocations.back().ptr(); + shape_size_fn(sub_shape))); + + se::DeviceMemoryBase 
static_input = + execution_input->Buffer(index).AsDeviceMemoryBase(); + se::DeviceMemory* dynamic_input_base = dynamic_input.ptr(); // Send the original data to the new location. - stream->ThenMemcpyD2D(dynamic_input_base, *static_input, - static_input->size()); + stream->ThenMemcpyD2D(dynamic_input_base, static_input, + static_input.size()); TF_RETURN_IF_ERROR(UpdateMetadata(stream, dynamic_input_base, - compile_time_shape, runtime_shape)); + sub_shape, *runtime_shape)); // Modify the memory location in the input shape tree to point to the // new input. - runtime_input->set_buffer(*dynamic_input_base, index); + execution_input->SetBuffer( + index, xla::MaybeOwningDeviceMemory(std::move(dynamic_input))); + execution_input->ClearUnownedIndex(index); element_modified = true; return Status::OK(); })); if (element_modified) { - runtime_input->set_shapes(compile_time_shape, compile_time_shape); + TF_RETURN_IF_ERROR(execution_input->SetDynamicShape(compile_time_shape)); + TF_ASSIGN_OR_RETURN(xla::ShapedBuffer shaped_buffer, + execution_input->ToShapedBuffer( + allocator, stream->parent()->device_ordinal())); // The input location has been modified, need to fix tuple table to // point to the correct address. TF_ASSIGN_OR_RETURN( auto transfer_manager, xla::TransferManager::GetForPlatform(stream->parent()->platform())); TF_RETURN_IF_ERROR( - transfer_manager->WriteTupleIndexTablesAsync(stream, *runtime_input)); + transfer_manager->WriteTupleIndexTablesAsync(stream, shaped_buffer)); } } - return std::move(new_allocations); -} - -xla::StatusOr ReadMetadataLiteral( - se::Stream* stream, se::DeviceMemoryBase* buffer, - const xla::Shape& buffer_shape, xla::TransferManager* transfer_manager) { - TF_ASSIGN_OR_RETURN(auto compiler, xla::Compiler::GetForPlatform( - stream->parent()->platform())); - auto shape_size_fn = compiler->ShapeSizeBytesFunction(); - xla::Shape buffer_shape_static = - xla::ShapeUtil::MakeStaticShape(buffer_shape); - const int64 offset = shape_size_fn(buffer_shape_static); - int64 metadata_size = shape_size_fn(buffer_shape) - offset; - TF_RET_CHECK(metadata_size != 0); - auto buffer_8 = se::DeviceMemory(*buffer); - auto metadata_buffer = - stream->parent()->GetSubBuffer(&buffer_8, offset, metadata_size); - return transfer_manager->TransferArrayFromDevice( - stream, - xla::ShapeUtil::MakeShape(xla::S32, {buffer_shape.dimensions_size()}), - metadata_buffer); -} - -// For each subshape in the result buffer that's dynamic, read the dynamic -// dimension sizes from the metadata, and update output shapes. The result shape -// is a static and concrete shape. 
-xla::Status UpdateDynamicOutputs(se::Stream* stream, - xla::ShapedBuffer* shaped_buffer, - xla::Shape* output_host_shape, - xla::Shape* output_device_shape) { - DCHECK(output_device_shape->is_dynamic()); - TF_ASSIGN_OR_RETURN( - auto transfer_manager, - xla::TransferManager::GetForPlatform(stream->parent()->platform())); - TF_RETURN_IF_ERROR(stream->BlockHostUntilDone()); - TF_RETURN_IF_ERROR(shaped_buffer->buffers().ForEachMutableElementWithStatus( - [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) { - const xla::Shape& buffer_shape = - xla::ShapeUtil::GetSubshape(*output_device_shape, index); - if (buffer_shape.IsTuple()) { - return Status::OK(); - } - xla::Shape& host_shape = - *xla::ShapeUtil::GetMutableSubshape(output_host_shape, index); - xla::Shape& device_shape = - *xla::ShapeUtil::GetMutableSubshape(output_device_shape, index); - if (device_shape.is_static()) { - return Status::OK(); - } - TF_ASSIGN_OR_RETURN(auto metadata, - ReadMetadataLiteral(stream, buffer, buffer_shape, - transfer_manager)); - // Update shape size from metadata. - for (int64 i = 0; i < metadata.element_count(); ++i) { - host_shape.mutable_dimensions()[i] = metadata.Get({i}); - device_shape.mutable_dimensions()[i] = metadata.Get({i}); - } - return Status::OK(); - })); - output_host_shape->clear_dynamic_dimensions(); - output_device_shape->clear_dynamic_dimensions(); return Status::OK(); } -// Create output tuple from run_result. xla::StatusOr> CreateOutputTuple( - se::Stream* stream, xla::ScopedShapedBuffer run_result, - xla::Backend* backend, int device_ordinal) { + se::Stream* stream, xla::ExecutionOutput run_result, xla::Backend* backend, + int device_ordinal) { XRTTupleAllocation* output_tuple; - xla::ShapedBuffer shaped_buffer = run_result.release(); - if (shaped_buffer.on_device_shape().is_dynamic()) { + xla::ScopedShapedBuffer* shaped_buffer = run_result.MutableResult(); + if (shaped_buffer->on_device_shape().is_dynamic()) { // Update dynamic shapes from output buffer, and create a XRT tensor with // dimension sizes read from metadata. - xla::Shape output_host_shape = shaped_buffer.on_host_shape(); - xla::Shape output_device_shape = shaped_buffer.on_device_shape(); - TF_RETURN_IF_ERROR(UpdateDynamicOutputs( - stream, &shaped_buffer, &output_host_shape, &output_device_shape)); + xla::Shape output_host_shape = shaped_buffer->on_host_shape(); + xla::Shape output_device_shape = shaped_buffer->on_device_shape(); + TF_ASSIGN_OR_RETURN( + auto transfer_manager, + xla::TransferManager::GetForPlatform(stream->parent()->platform())); + TF_RETURN_IF_ERROR(transfer_manager->ReadDynamicShapes( + stream, shaped_buffer, &output_host_shape, &output_device_shape)); TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( - shaped_buffer, output_host_shape, output_device_shape, backend, + *shaped_buffer, output_host_shape, output_device_shape, backend, device_ordinal, &output_tuple)); } else { // Fast-path: Don't copy shapes of output buffer. TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( - shaped_buffer, backend, device_ordinal, &output_tuple)); + *shaped_buffer, backend, device_ordinal, &output_tuple)); } + // After the output tuple is created, we can release the output result + // buffers, to make sure they won't be cleared by its destructor. 
+ (void)run_result.ConsumeResult().release(); return RefPtr(output_tuple); } xla::StatusOr> RunExecutable( OpKernelContext* context, XRTGenericDeviceAccessor::ScopedRef* device_ref, - xla::LocalExecutable* executable, const InputBuffers& input_buffers, - se::Stream* stream, int rng_seed, + xla::LocalExecutable* executable, + absl::Span> input_tuples, + bool release_inputs, se::Stream* stream, int rng_seed, const xrt::CommonExecutionConfig& config) { - VLOG(2) << "Executing computation."; + const xla::ComputationLayout& computation_layout = + executable->executable()->module_config().entry_computation_layout(); + std::vector input_is_dynamic = GetDynamicInputInfo(computation_layout); + TF_ASSIGN_OR_RETURN( + std::vector execution_inputs, + GetArgumentsBuffers( + executable->executable()->module().input_output_alias_config(), + input_tuples, input_is_dynamic, release_inputs)); + xla::ExecutableRunOptions run_options; run_options.set_stream(stream); run_options.set_allocator(device_ref->backend()->memory_allocator()); @@ -419,51 +345,28 @@ xla::StatusOr> RunExecutable( } run_options.set_gpu_executable_run_options(&gpu_options); - Env* env = Env::Default(); - auto start_time = env->NowMicros(); const std::vector& shape_layouts = executable->executable() ->module_config() .entry_computation_layout() .parameter_layouts(); - TF_ASSIGN_OR_RETURN(auto new_allocations, - UpdateDynamicInputs(stream, run_options.allocator(), - input_buffers.input_pointers, - shape_layouts)); - auto new_allocations_ptr = - std::make_shared>( - std::move(new_allocations)); + TF_RETURN_IF_ERROR(UpdateDynamicInputs(stream, run_options.allocator(), + &execution_inputs, shape_layouts)); TF_ASSIGN_OR_RETURN( - xla::ScopedShapedBuffer run_result, - executable->Run(input_buffers.input_pointers, run_options)); - // Retain the new allocation for input memory until the end of execution. - stream->ThenDoHostCallback([new_allocations_ptr]() { return Status::OK(); }); - - auto elapsed = env->NowMicros() - start_time; - VLOG(2) << "Elapsed time: " << elapsed << "us"; + xla::ExecutionOutput run_result, + executable->Run(std::move(execution_inputs), run_options)); TF_ASSIGN_OR_RETURN( RefPtr output_tuple_ptr, CreateOutputTuple(stream, std::move(run_result), device_ref->backend(), device_ref->device_ordinal())); - // The ScopedShapedBuffer returned by the executable Run() API, in case of // input/output buffer aliasing, might have holes in it, which need to be // filled using the proper input tuples buffers which are the source of // aliasing. - const xla::HloInputOutputAliasConfig& input_output_alias = - executable->executable()->module().input_output_alias_config(); - auto alias_function = - [&](const xla::ShapeIndex& output_index, - const xla::HloInputOutputAliasConfig::Alias& alias) -> Status { - TF_RET_CHECK(alias.parameter_number < input_buffers.input_tuples.size()); - return alias.kind == xla::HloInputOutputAliasConfig::AliasKind::kUserAlias - ? 
output_tuple_ptr->AliasBufferFrom( - *input_buffers.input_tuples[alias.parameter_number], - alias.parameter_index, output_index) - : Status::OK(); - }; - TF_RETURN_IF_ERROR(input_output_alias.ForEachAliasWithStatus(alias_function)); + TF_RETURN_IF_ERROR(RebuildOutputAliases( + output_tuple_ptr, input_tuples, + executable->executable()->module().input_output_alias_config())); return std::move(output_tuple_ptr); } @@ -471,12 +374,13 @@ xla::StatusOr> RunExecutable( xla::StatusOr> ExecuteComputation( OpKernelContext* context, XRTMemoryManager* memory_manager, XRTGenericDeviceAccessor::ScopedRef* device_ref, - xla::LocalExecutable* executable, const InputBuffers& input_buffers, - se::Stream* stream, int rng_seed, + xla::LocalExecutable* executable, + absl::Span> input_tuples, + bool release_inputs, se::Stream* stream, int rng_seed, const xrt::CommonExecutionConfig& config) { auto runfn = [&]() { - return RunExecutable(context, device_ref, executable, input_buffers, stream, - rng_seed, config); + return RunExecutable(context, device_ref, executable, input_tuples, + release_inputs, stream, rng_seed, config); }; // We pass zero as requested_free_size as there is no simple way to get the @@ -495,12 +399,13 @@ xla::StatusOr> ExecuteComputation( se::Stream* stream, int rng_seed, const xrt::CommonExecutionConfig& config) { XRTMemoryManager::WorkingSet working_set(memory_manager); - TF_ASSIGN_OR_RETURN(InputBuffers input_buffers, - GetInputBuffers(&working_set, device_ref->backend(), - input_coords, release_inputs)); + TF_ASSIGN_OR_RETURN( + std::vector> input_tuples, + GetInputTuples(executable, &working_set, device_ref->backend(), + input_coords, release_inputs)); return ExecuteComputation(context, memory_manager.get(), device_ref, - executable, input_buffers, stream, rng_seed, - config); + executable, input_tuples, release_inputs, stream, + rng_seed, config); } // XRTExecuteOp @@ -653,16 +558,16 @@ Status XRTExecuteChainedOp::DoWork(OpKernelContext* context) { auto execute_op = [&](const xrt::XRTChainedExecuteOp& op, absl::Span> op_inputs) -> xla::StatusOr> { - TF_ASSIGN_OR_RETURN(InputBuffers input_buffers, - GetChainedOpInputs(op, op_inputs)); - std::unique_ptr entry; TF_RETURN_IF_ERROR(cache->Lookup(op.computation_handle(), &entry)); xla::LocalExecutable* executable = entry->get().get_executable(); - return ExecuteComputation(context, memory_manager.get(), &device_ref, - executable, input_buffers, stream, rng_seed, - config.common_config()); + TF_ASSIGN_OR_RETURN(std::vector> input_tuples, + GetChainedOpInputTuples(op, op_inputs)); + + return ExecuteComputation( + context, memory_manager.get(), &device_ref, executable, input_tuples, + /*release_inputs=*/false, stream, rng_seed, config.common_config()); }; return ExecuteChained(context, memory_manager, device_ref.backend(), diff --git a/tensorflow/compiler/xrt/xrt_util.cc b/tensorflow/compiler/xrt/xrt_util.cc index b8a0afc92c5..926ba23c7af 100644 --- a/tensorflow/compiler/xrt/xrt_util.cc +++ b/tensorflow/compiler/xrt/xrt_util.cc @@ -221,6 +221,140 @@ xla::StatusOr> GetComputationInputs( return std::move(input_coords); } +bool InputShapeMatches(const xla::Shape& parameter_shape, + const xla::Shape& input_shape) { + auto shape_checker = [&](const xla::Shape& pshape, + const xla::ShapeIndex& index) { + if (pshape.IsArray()) { + TF_ASSIGN_OR_RETURN(const xla::Shape* ishape, + xla::ShapeUtil::TryGetSubshape(input_shape, index)); + if (pshape.rank() != ishape->rank() || + pshape.element_type() != ishape->element_type()) { + return 
errors::InvalidArgument("Mismatching shapes"); + } + if (pshape.is_static() && pshape.layout() != ishape->layout()) { + return errors::InvalidArgument("Mismatching layouts"); + } + for (int64 dim = 0; dim < pshape.rank(); ++dim) { + if (pshape.is_dynamic_dimension(dim)) { + if (pshape.dimensions(dim) < ishape->dimensions(dim)) { + return errors::InvalidArgument("Mismatching shapes"); + } + } else if (pshape.dimensions(dim) != ishape->dimensions(dim)) { + return errors::InvalidArgument("Mismatching shapes"); + } + } + } + return Status::OK(); + }; + return xla::ShapeUtil::ForEachSubshapeWithStatus(parameter_shape, + shape_checker) + .ok(); +} + +xla::StatusOr>> GetInputTupleAllocations( + const std::vector& input_coords, + XRTMemoryManager::WorkingSet* working_set, xla::Backend* backend, + int64 num_input_shapes, + const std::function& shape_getter, bool release_inputs) { + if (input_coords.size() != num_input_shapes) { + return errors::InvalidArgument( + "Number of inputs does not match executable proto input shapes: ", + input_coords.size(), " vs. ", num_input_shapes); + } + std::vector> input_tuples; + input_tuples.reserve(input_coords.size()); + for (size_t i = 0; i < input_coords.size(); ++i) { + TF_RETURN_IF_ERROR( + working_set->LookupAndPin(backend, input_coords[i].handle)); + auto tuple = working_set->PinnedTuples().back(); + if (release_inputs) { + // We are holding a reference to the tuple, so we can safely delete it + // from the resource manager here. + TF_RETURN_IF_ERROR( + working_set->MemoryManager()->Release(input_coords[i].handle)); + VLOG(2) << "Released allocation handle " << input_coords[i].handle; + } + xla::Shape input_shape = shape_getter(i); + if (!InputShapeMatches(input_shape, tuple->on_host_shape())) { + return errors::InvalidArgument( + "Run-time shape mismatch for XRTExecute argument[", i, "] (", + input_coords[i].handle, "). Expected ", input_shape.DebugString(), + "; got ", tuple->on_host_shape().DebugString()); + } + if (input_coords[i].index.empty()) { + input_tuples.emplace_back(std::move(tuple)); + } else { + XRTTupleAllocation* sub_tuple; + TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer( + tuple.get(), input_coords[i].index, &sub_tuple, + /*alias_parent_allocation=*/true)); + input_tuples.emplace_back(sub_tuple); + } + } + return std::move(input_tuples); +} + +Status RebuildOutputAliases( + const RefPtr& output_tuple, + absl::Span> input_tuples, + const xla::HloInputOutputAliasConfig& input_output_alias) { + auto alias_function = + [&](const xla::ShapeIndex& output_index, + const xla::HloInputOutputAliasConfig::Alias& alias) -> Status { + TF_RET_CHECK(alias.parameter_number < input_tuples.size()); + return alias.kind == xla::HloInputOutputAliasConfig::AliasKind::kUserAlias + ? output_tuple->AliasBufferFrom( + *input_tuples[alias.parameter_number], + alias.parameter_index, output_index) + : Status::OK(); + }; + return input_output_alias.ForEachAliasWithStatus(alias_function); +} + +xla::StatusOr> GetArgumentsBuffers( + const xla::HloInputOutputAliasConfig& input_output_alias, + absl::Span> input_tuples, + const std::vector& input_is_dynamic, bool release_inputs) { + auto is_dynamic = [&](size_t arg) { + return arg < input_is_dynamic.size() && input_is_dynamic[arg]; + }; + std::vector arguments; + // Don't alias dynamic input -- Due to the underlying implementation, + // aliased inputs have two owners: XRTAllocation and return value of + // this function. 
If an argument is dynamic and the ownership is + // released to output of this function, TPUExecute will free it and + // reallocate a new one, which creates a double freeing issue where + // XRTAllocation also attempts to release the buffer. + bool alias_outputs = release_inputs && input_tuples.size() == 1 && + input_tuples[0]->IsExclusiveOwner() && !is_dynamic(0); + arguments.reserve(input_tuples.size()); + for (int64 i = 0; i < input_tuples.size(); ++i) { + auto alias_checker = + [&](const xla::ShapeIndex& index) -> xla::StatusOr { + // Only the buffers which the caller explicitly marked as aliased + // (kUserAlias), should create aliases. + // The XLA compiler might create opportunistic aliases (kSystemAlias) + // which need a different handling. With a system alias we know that XLA + // is going to reuse a given input parameter buffer for a given output, so + // unless it is known at call site that the input buffer has no more uses, + // a copy needs to be made at call site. With user specified alias the + // caller tells us that he expects a given output to land over the buffers + // of a given parametter. + if (input_output_alias.ParameterAliasKind(i, index) == + xla::HloInputOutputAliasConfig::AliasKind::kUserAlias) { + TF_RET_CHECK(!is_dynamic(i)); + return true; + } + return alias_outputs; + }; + TF_ASSIGN_OR_RETURN(xla::ExecutionInput exec_input, + input_tuples[i]->ToExecutionInput(alias_checker)); + arguments.emplace_back(std::move(exec_input)); + } + return std::move(arguments); +} + Status CreateExecuteOutput(OpKernelContext* context, XRTMemoryManager* memory_manager, RefPtr output_tuple, diff --git a/tensorflow/compiler/xrt/xrt_util.h b/tensorflow/compiler/xrt/xrt_util.h index cc1480fdb00..832c106621f 100644 --- a/tensorflow/compiler/xrt/xrt_util.h +++ b/tensorflow/compiler/xrt/xrt_util.h @@ -23,6 +23,8 @@ limitations under the License. #include #include "tensorflow/compiler/xla/service/backend.h" +#include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h" +#include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla.pb.h" @@ -69,6 +71,25 @@ xla::DebugOptions BuildXlaDebugOptions(const xla::DebugOptions& ref_options); xla::StatusOr> GetComputationInputs( OpKernelContext* context, const char* input_name); +bool InputShapeMatches(const xla::Shape& parameter_shape, + const xla::Shape& input_shape); + +xla::StatusOr>> GetInputTupleAllocations( + const std::vector& input_coords, + XRTMemoryManager::WorkingSet* working_set, xla::Backend* backend, + int64 num_input_shapes, + const std::function& shape_getter, bool release_inputs); + +Status RebuildOutputAliases( + const RefPtr& output_tuple, + absl::Span> input_tuples, + const xla::HloInputOutputAliasConfig& input_output_alias); + +xla::StatusOr> GetArgumentsBuffers( + const xla::HloInputOutputAliasConfig& input_output_alias, + absl::Span> input_tuples, + const std::vector& input_is_dynamic, bool release_inputs); + // Create the XRT execute output tensor given the computation result // (output_tuple). The return_exploded_tuple tells whether a tuple result should // be returned as vector of handles representing each tuple child. 
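// ----------------------------------------------------------------------------
// Aside: a minimal, self-contained sketch of the buffer-donation rule that the
// GetArgumentsBuffers() change above implements. Everything below (AliasKind,
// ArgumentInfo, ShouldDonateBuffer) is a hypothetical stand-in, not part of
// this patch and not the actual XLA/XRT API; it only restates the decision
// logic, assuming the behavior described in the comments above:
//   * an explicit user alias (kUserAlias) donates the parameter buffer, but
//     never for a dynamically shaped parameter, since the runtime may
//     reallocate a dynamic input and the XRT allocation would then be freed
//     twice;
//   * otherwise donation happens only when the caller releases the inputs,
//     there is a single input tuple, it is exclusively owned, and it is not
//     dynamic (the alias_outputs condition in the patch).
#include <cassert>
#include <cstddef>
#include <vector>

namespace xrt_aliasing_sketch {

enum class AliasKind { kNoAlias, kSystemAlias, kUserAlias };

struct ArgumentInfo {
  bool is_dynamic = false;         // parameter has a dynamic shape
  bool exclusively_owned = false;  // no other live references to the tuple
};

// Returns true if the buffer backing argument `arg` may be handed to the
// executable with ownership (i.e. donated and possibly reused for an output).
bool ShouldDonateBuffer(const std::vector<ArgumentInfo>& args, std::size_t arg,
                        AliasKind declared_alias, bool release_inputs) {
  assert(arg < args.size());
  if (declared_alias == AliasKind::kUserAlias) {
    // User-declared aliases always donate; dynamic parameters must not be
    // user-aliased at all.
    assert(!args[arg].is_dynamic);
    return true;
  }
  // A system (compiler-chosen) alias does not force donation: unless the
  // input is known to be dead, the runtime would have to copy instead.
  // Fall back to the conservative whole-call rule from the patch.
  return release_inputs && args.size() == 1 && args[0].exclusively_owned &&
         !args[0].is_dynamic;
}

}  // namespace xrt_aliasing_sketch

// Example: a single, exclusively owned, statically shaped input with
// release_inputs=true donates even without a declared alias:
//   ShouldDonateBuffer({{false, true}}, 0, AliasKind::kNoAlias, true) -> true
// ----------------------------------------------------------------------------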
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 50f1f2527a5..d0be6ee9597 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -72,6 +72,7 @@ load( "if_ios", "if_mobile", "if_not_windows", + "if_tpu", "tf_android_core_proto_headers", "tf_cc_test", "tf_cc_test_mkl", @@ -1093,6 +1094,8 @@ cc_library( ]) + if_tensorrt([ "//tensorflow/compiler/tf2tensorrt:trt_engine_resource_op_kernels", "//tensorflow/compiler/tf2tensorrt:trt_op_kernels", + ]) + if_tpu([ + "//tensorflow/core/tpu/kernels", ]), ) @@ -1861,7 +1864,9 @@ cc_library( "//tensorflow/core/lib/io:random_inputstream", "//tensorflow/core/lib/io:record_reader", "//tensorflow/core/lib/io:record_writer", + "//tensorflow/core/lib/io:snappy_compression_options", "//tensorflow/core/lib/io:snappy_inputbuffer", + "//tensorflow/core/lib/io:snappy_inputstream", "//tensorflow/core/lib/io:snappy_outputbuffer", "//tensorflow/core/lib/io:table", "//tensorflow/core/lib/io:table_options", @@ -2254,7 +2259,7 @@ tf_cuda_library( "//tensorflow/core/platform/default/build_config:platformlib", "//tensorflow/core/profiler/lib:annotated_traceme", "//tensorflow/core/profiler/lib:traceme", - "//tensorflow/core/tpu:tpu_library_loader", + "//tensorflow/core/tpu:tpu_api_dlsym_initializer", "//tensorflow/core/util:einsum_op_util", "//tensorflow/core/util:padding", "//tensorflow/core/util:port", diff --git a/tensorflow/core/api_def/base_api/api_def_BandedTriangularSolve.pbtxt b/tensorflow/core/api_def/base_api/api_def_BandedTriangularSolve.pbtxt new file mode 100644 index 00000000000..ba5e1bdcaf2 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_BandedTriangularSolve.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "BandedTriangularSolve" + visibility: HIDDEN +} diff --git a/tensorflow/core/api_def/base_api/api_def_BesselI0.pbtxt b/tensorflow/core/api_def/base_api/api_def_BesselI0.pbtxt new file mode 100644 index 00000000000..2c47960429c --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_BesselI0.pbtxt @@ -0,0 +1,4 @@ +op { + graph_op_name: "BesselI0" + visibility: HIDDEN +} diff --git a/tensorflow/core/api_def/base_api/api_def_BesselI0e.pbtxt b/tensorflow/core/api_def/base_api/api_def_BesselI0e.pbtxt index 08313cebb99..7965af4916e 100644 --- a/tensorflow/core/api_def/base_api/api_def_BesselI0e.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_BesselI0e.pbtxt @@ -1,10 +1,4 @@ op { graph_op_name: "BesselI0e" - summary: "Computes the Bessel i0e function of `x` element-wise." - description: <